Diffstat (limited to 'arch/x86/entry')
30 files changed, 1640 insertions, 1070 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b..72cae8e0ce85 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -7,17 +7,20 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -CFLAGS_common.o += -fno-stack-protector +CFLAGS_syscall_32.o += -fno-stack-protector +CFLAGS_syscall_64.o += -fno-stack-protector obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o -obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ -obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o -obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o -obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o +obj-$(CONFIG_PREEMPTION) += thunk.o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f6907627172b..77e2d920a640 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,11 +65,13 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */ + /* We just clobbered the return address - use the IRET frame for unwinding: */ + UNWIND_HINT_IRET_REGS offset=3*8 .else pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -87,14 +89,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi /* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_callee=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -108,18 +113,19 @@ For 32-bit we have the following conventions - kernel is built with xorl %r9d, %r9d /* nospec r9 */ xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ + .if \clear_callee xorl %ebx, %ebx /* nospec rbx */ xorl %ebp, %ebp /* nospec rbp */ xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ xorl %r15d, %r15d /* nospec r15 */ - + .endif .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_callee=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_callee=\clear_callee .endm .macro POP_REGS pop_rdi=1 @@ -142,10 +148,10 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two * halves: */ #define PTI_USER_PGTABLE_BIT PAGE_SHIFT @@ -160,7 +166,7 @@ For 32-bit we have the following conventions - kernel is built with .macro ADJUST_KERNEL_CR3 reg:req ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID - /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg .endm @@ -173,10 +179,9 @@ For 32-bit we have the following conventions - kernel is built with .endm #define THIS_CPU_user_pcid_flush_mask \ - PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask) -.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req - ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI +.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req mov %cr3, \scratch_reg ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID @@ -206,13 +211,20 @@ For 32-bit we have the following conventions - kernel is built with /* Flip the PGD to the user version */ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg mov \scratch_reg, %cr3 +.endm + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2 .Lend_\@: .endm .macro SWITCH_TO_USER_CR3_STACK scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI pushq %rax - SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax + SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax popq %rax +.Lend_\@: .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req @@ -233,17 +245,19 @@ For 32-bit we have the following conventions - kernel is built with .Ldone_\@: .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +/* Restore CR3 from a kernel context. May restore a user CR3 value. */ +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID - /* - * KERNEL pages can always resume with NOFLUSH as we do - * explicit flushes. + * If CR3 contained the kernel page tables at the paranoid exception + * entry, then there is nothing to restore as CR3 is not modified while + * handling the exception. */ bt $PTI_USER_PGTABLE_BIT, \save_reg - jnc .Lnoflush_\@ + jnc .Lend_\@ + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID /* * Check if there's a pending flush for the user ASID we're @@ -251,25 +265,17 @@ For 32-bit we have the following conventions - kernel is built with */ movq \save_reg, \scratch_reg andq $(0x7FF), \scratch_reg - bt \scratch_reg, THIS_CPU_user_pcid_flush_mask - jnc .Lnoflush_\@ - btr \scratch_reg, THIS_CPU_user_pcid_flush_mask - jmp .Lwrcr3_\@ + jc .Lwrcr3_\@ -.Lnoflush_\@: SET_NOFLUSH_BIT \save_reg .Lwrcr3_\@: - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. 
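A rough userspace model of the CR3 bit juggling these macros perform may help (the bit constants mirror the kernel's; the helper names and the flush_pending flag, which stands in for the user_pcid_flush_mask test-and-clear, are illustrative):

#include <stdbool.h>
#include <stdint.h>

#define PTI_USER_PGTABLE_BIT	12	/* PAGE_SHIFT: selects the 4k PGD half */
#define PTI_USER_PCID_BIT	11	/* X86_CR3_PTI_PCID_USER_BIT */
#define CR3_NOFLUSH_BIT		63	/* X86_CR3_PCID_NOFLUSH */

/* SWITCH_TO_USER_CR3, PCID case: flip to the user PGD half and user ASID */
static uint64_t user_cr3_from_kernel(uint64_t kcr3)
{
	return kcr3 | (1ULL << PTI_USER_PGTABLE_BIT) | (1ULL << PTI_USER_PCID_BIT);
}

/* PARANOID_RESTORE_CR3: the value that ends up written back to %cr3 */
static uint64_t paranoid_restore_cr3(uint64_t saved_cr3, bool flush_pending)
{
	/* Kernel CR3 was live at entry: CR3 was never switched, keep it */
	if (!(saved_cr3 & (1ULL << PTI_USER_PGTABLE_BIT)))
		return saved_cr3;

	/* User CR3: skip the implicit TLB flush unless one is queued */
	if (!flush_pending)
		saved_cr3 |= 1ULL << CR3_NOFLUSH_BIT;
	return saved_cr3;
}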
- */ movq \save_reg, %cr3 .Lend_\@: .endm -#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req .endm @@ -279,7 +285,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif @@ -297,7 +303,7 @@ For 32-bit we have the following conventions - kernel is built with * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. */ .macro IBRS_ENTER save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -326,7 +332,7 @@ For 32-bit we have the following conventions - kernel is built with * regs. Must be called after the last RET. */ .macro IBRS_EXIT save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -362,7 +368,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro STACKLEAK_ERASE_NOCLOBBER -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE PUSH_AND_CLEAR_REGS call stackleak_erase POP_REGS @@ -381,7 +387,7 @@ For 32-bit we have the following conventions - kernel is built with #endif /* !CONFIG_X86_64 */ .macro STACKLEAK_ERASE -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE call stackleak_erase #endif .endm @@ -420,3 +426,64 @@ For 32-bit we have the following conventions - kernel is built with .endm #endif /* CONFIG_SMP */ + +#ifdef CONFIG_X86_64 + +/* rdi: arg1 ... normal C conventions. rax is saved/restored. */ +.macro THUNK name, func +SYM_FUNC_START(\name) + ANNOTATE_NOENDBR + pushq %rbp + movq %rsp, %rbp + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + call \func + + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rbp + RET +SYM_FUNC_END(\name) + _ASM_NOKPROBE(\name) +.endm + +#else /* CONFIG_X86_32 */ + +/* put return address in eax (arg1) */ +.macro THUNK name, func, put_ret_addr_in_eax=0 +SYM_CODE_START_NOALIGN(\name) + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + RET + _ASM_NOKPROBE(\name) +SYM_CODE_END(\name) + .endm + +#endif diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c deleted file mode 100644 index 6c2826417b33..000000000000 --- a/arch/x86/entry/common.c +++ /dev/null @@ -1,320 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * common.c - C code for kernel entry and exit - * Copyright (c) 2015 Andrew Lutomirski - * - * Based on asm and ptrace code by many authors. The code here originated - * in ptrace.c and signal.c. 
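One detail worth spelling out for the IBRS_ENTER/IBRS_EXIT macros above: WRMSR takes the MSR index in ECX and the 64-bit value split across EDX:EAX, so the macro bodies (mostly elided by the hunk context here) have to split the saved SPEC_CTRL value accordingly. A trivial standalone model of that split (helper name illustrative):

#include <stdint.h>

static void msr_split(uint64_t val, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)val;		/* low half */
	*edx = (uint32_t)(val >> 32);	/* high half */
}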
- */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/entry-common.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/export.h> -#include <linux/nospec.h> -#include <linux/syscalls.h> -#include <linux/uaccess.h> - -#ifdef CONFIG_XEN_PV -#include <xen/xen-ops.h> -#include <xen/events.h> -#endif - -#include <asm/desc.h> -#include <asm/traps.h> -#include <asm/vdso.h> -#include <asm/cpufeature.h> -#include <asm/fpu/api.h> -#include <asm/nospec-branch.h> -#include <asm/io_bitmap.h> -#include <asm/syscall.h> -#include <asm/irq_stack.h> - -#ifdef CONFIG_X86_64 - -static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) -{ - /* - * Convert negative numbers to very high and thus out of range - * numbers for comparisons. - */ - unsigned int unr = nr; - - if (likely(unr < NR_syscalls)) { - unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); - return true; - } - return false; -} - -static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) -{ - /* - * Adjust the starting offset of the table, and convert numbers - * < __X32_SYSCALL_BIT to very high and thus out of range - * numbers for comparisons. - */ - unsigned int xnr = nr - __X32_SYSCALL_BIT; - - if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { - xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); - return true; - } - return false; -} - -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) -{ - add_random_kstack_offset(); - nr = syscall_enter_from_user_mode(regs, nr); - - instrumentation_begin(); - - if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { - /* Invalid system call, but still a system call. */ - regs->ax = __x64_sys_ni_syscall(regs); - } - - instrumentation_end(); - syscall_exit_to_user_mode(regs); -} -#endif - -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static __always_inline int syscall_32_enter(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_IA32_EMULATION)) - current_thread_info()->status |= TS_COMPAT; - - return (int)regs->orig_ax; -} - -/* - * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. - */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) -{ - /* - * Convert negative numbers to very high and thus out of range - * numbers for comparisons. - */ - unsigned int unr = nr; - - if (likely(unr < IA32_NR_syscalls)) { - unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); - } else if (nr != -1) { - regs->ax = __ia32_sys_ni_syscall(regs); - } -} - -/* Handles int $0x80 */ -__visible noinstr void do_int80_syscall_32(struct pt_regs *regs) -{ - int nr = syscall_32_enter(regs); - - add_random_kstack_offset(); - /* - * Subtlety here: if ptrace pokes something larger than 2^31-1 into - * orig_ax, the int return value truncates it. This matches - * the semantics of syscall_get_nr(). - */ - nr = syscall_enter_from_user_mode(regs, nr); - instrumentation_begin(); - - do_syscall_32_irqs_on(regs, nr); - - instrumentation_end(); - syscall_exit_to_user_mode(regs); -} - -static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) -{ - int nr = syscall_32_enter(regs); - int res; - - add_random_kstack_offset(); - /* - * This cannot use syscall_enter_from_user_mode() as it has to - * fetch EBP before invoking any of the syscall entry work - * functions. 
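The removed do_syscall_x64()/do_syscall_32_irqs_on() dispatchers above lean on two tricks (the 32-bit copy reappears in syscall_32.c later in this diff): casting the signed syscall number to unsigned so a single compare rejects both negative and out-of-range values, and array_index_nospec() to clamp the index under speculation. A minimal userspace model of the bounds check (the table size is illustrative):

#include <stdio.h>

#define NR_SYSCALLS 453	/* illustrative; the real table size is generated */

/*
 * A negative nr converts to a huge unsigned value, so one unsigned
 * compare rejects both nr < 0 and nr >= NR_SYSCALLS.  The kernel
 * additionally clamps the index with array_index_nospec() so a
 * mispredicted bounds check cannot index past the table.
 */
static int syscall_in_range(int nr)
{
	unsigned int unr = nr;

	return unr < NR_SYSCALLS;
}

int main(void)
{
	printf("%d %d %d\n",
	       syscall_in_range(0),	/* 1 */
	       syscall_in_range(-1),	/* 0: wraps to 0xffffffff */
	       syscall_in_range(453));	/* 0 */
	return 0;
}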
- */ - syscall_enter_from_user_mode_prepare(regs); - - instrumentation_begin(); - /* Fetch EBP from where the vDSO stashed it. */ - if (IS_ENABLED(CONFIG_X86_64)) { - /* - * Micro-optimization: the pointer we're following is - * explicitly 32 bits, so it can't be out of range. - */ - res = __get_user(*(u32 *)®s->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp); - } else { - res = get_user(*(u32 *)®s->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp); - } - - if (res) { - /* User code screwed up. */ - regs->ax = -EFAULT; - - local_irq_disable(); - instrumentation_end(); - irqentry_exit_to_user_mode(regs); - return false; - } - - nr = syscall_enter_from_user_mode_work(regs, nr); - - /* Now this is just like a normal syscall. */ - do_syscall_32_irqs_on(regs, nr); - - instrumentation_end(); - syscall_exit_to_user_mode(regs); - return true; -} - -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_fast_syscall_32(struct pt_regs *regs) -{ - /* - * Called using the internal vDSO SYSENTER/SYSCALL32 calling - * convention. Adjust regs so it looks like we entered using int80. - */ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + - vdso_image_32.sym_int80_landing_pad; - - /* - * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward - * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. - * Fix it up. - */ - regs->ip = landing_pad; - - /* Invoke the syscall. If it failed, keep it simple: use IRET. */ - if (!__do_fast_syscall_32(regs)) - return 0; - -#ifdef CONFIG_X86_64 - /* - * Opportunistic SYSRETL: if possible, try to return using SYSRETL. - * SYSRETL is available on all 64-bit CPUs, so we don't need to - * bother with SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. - */ - return regs->cs == __USER32_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; -#else - /* - * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. - * - * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, - * because the ECX fixup above will ensure that this is essentially - * never the case. - * - * We don't allow syscalls at all from VM86 mode, but we still - * need to check VM, because we might be returning from sys_vm86. - */ - return static_cpu_has(X86_FEATURE_SEP) && - regs->cs == __USER_CS && regs->ss == __USER_DS && - regs->ip == landing_pad && - (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; -#endif -} - -/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) -{ - /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ - regs->sp = regs->bp; - - /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ - regs->flags |= X86_EFLAGS_IF; - - return do_fast_syscall_32(regs); -} -#endif - -SYSCALL_DEFINE0(ni_syscall) -{ - return -ENOSYS; -} - -#ifdef CONFIG_XEN_PV -#ifndef CONFIG_PREEMPTION -/* - * Some hypercalls issued by the toolstack can take many 10s of - * seconds. Allow tasks running hypercalls via the privcmd driver to - * be voluntarily preempted even if full kernel preemption is - * disabled. - * - * Such preemptible hypercalls are bracketed by - * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() - * calls. 
- */ -DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); -EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); - -/* - * In case of scheduling the flag must be cleared and restored after - * returning from schedule as the task might move to a different CPU. - */ -static __always_inline bool get_and_clear_inhcall(void) -{ - bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); - - __this_cpu_write(xen_in_preemptible_hcall, false); - return inhcall; -} - -static __always_inline void restore_inhcall(bool inhcall) -{ - __this_cpu_write(xen_in_preemptible_hcall, inhcall); -} -#else -static __always_inline bool get_and_clear_inhcall(void) { return false; } -static __always_inline void restore_inhcall(bool inhcall) { } -#endif - -static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - inc_irq_stat(irq_hv_callback_count); - - xen_hvm_evtchn_do_upcall(); - - set_irq_regs(old_regs); -} - -__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) -{ - irqentry_state_t state = irqentry_enter(regs); - bool inhcall; - - instrumentation_begin(); - run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); - - inhcall = get_and_clear_inhcall(); - if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { - irqentry_exit_cond_resched(); - instrumentation_end(); - restore_inhcall(inhcall); - } else { - instrumentation_end(); - irqentry_exit(regs, state); - } -} -#endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index bfb7bcb362bc..6ba2b3adcef0 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -3,20 +3,75 @@ * Common place for both 32- and 64-bit entry routines. */ +#include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/linkage.h> -#include <asm/export.h> +#include <linux/objtool.h> #include <asm/msr-index.h> +#include <asm/unwind_hints.h> +#include <asm/segment.h> +#include <asm/cache.h> +#include <asm/cpufeatures.h> +#include <asm/nospec-branch.h> + +#include "calling.h" .pushsection .noinstr.text, "ax" -SYM_FUNC_START(entry_ibpb) +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) + ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx - movl $PRED_CMD_IBPB, %eax + movl _ASM_RIP(x86_pred_cmd), %eax xorl %edx, %edx wrmsr + + /* Make sure IBPB clears return stack predictions too. */ + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET + RET +SYM_FUNC_END(write_ibpb) +EXPORT_SYMBOL_FOR_KVM(write_ibpb); + +SYM_FUNC_START(__WARN_trap) + ANNOTATE_NOENDBR + ANNOTATE_REACHABLE + ud1 (%edx), %_ASM_ARG1 RET -SYM_FUNC_END(entry_ibpb) -/* For KVM */ -EXPORT_SYMBOL_GPL(entry_ibpb); +SYM_FUNC_END(__WARN_trap) +EXPORT_SYMBOL(__WARN_trap) + +.popsection + +/* + * Define the VERW operand that is disguised as entry code so that + * it can be referenced with KPTI enabled. This ensures VERW can be + * used late in exit-to-user path after page tables are switched. + */ +.pushsection .entry.text, "ax" + +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_START_NOALIGN(x86_verw_sel) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + .word __KERNEL_DS +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_END(x86_verw_sel); +EXPORT_SYMBOL_FOR_KVM(x86_verw_sel); .popsection + +THUNK warn_thunk_thunk, __warn_thunk + +/* + * Clang's implementation of TLS stack cookies requires the variable in + * question to be a TLS variable.
If the variable happens to be defined as an + * ordinary variable with external linkage in the same compilation unit (which + * amounts to the whole of vmlinux with LTO enabled), Clang will drop the + * segment register prefix from the references, resulting in broken code. Work + * around this by avoiding the symbol used in -mstack-protector-guard-symbol= + * entirely in the C code, and use an alias emitted by the linker script + * instead. + */ +#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP) +EXPORT_SYMBOL(__ref_stack_chk_guard); +#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 6e6af42e044a..92c0b4a94e0a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -305,7 +305,7 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET) ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which emit the idt entries which are shared * between 32 and 64 bit and emit the __irqentry_text_* markers */ @@ -837,7 +833,7 @@ SYM_FUNC_START(entry_SYSENTER_32) movl %esp, %eax call do_SYSENTER_32 - testl %eax, %eax + testb %al, %al jz .Lsyscall_32_done STACKLEAK_ERASE @@ -875,6 +871,8 @@ SYM_FUNC_START(entry_SYSENTER_32) /* Now ready to switch the cr3 */ SWITCH_TO_USER_CR3 scratch_reg=%eax + /* Clobbers ZF */ + CLEAR_CPU_BUFFERS /* * Restore all flags except IF. (We restore IF separately because @@ -954,6 +952,7 @@ restore_all_switch_stack: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code + CLEAR_CPU_BUFFERS .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization @@ -1154,7 +1153,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1166,6 +1165,7 @@ SYM_CODE_START(asm_exc_nmi) CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 + CLEAR_CPU_BUFFERS jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 @@ -1207,6 +1207,7 @@ SYM_CODE_START(asm_exc_nmi) * 1 - orig_ax */ lss (1+5+6)*4(%esp), %esp # back to espfix stack + CLEAR_CPU_BUFFERS jmp .Lirq_return #endif SYM_CODE_END(asm_exc_nmi) @@ -1216,7 +1217,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 43606de22511..f9983a1907bf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -18,6 +18,8 @@ * - SYM_FUNC_START/END: Define functions in the symbol table. * - idtentry: Define exception entry points.
*/ +#include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/cache.h> @@ -34,7 +36,6 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/pgtable_types.h> -#include <asm/export.h> #include <asm/frame.h> #include <asm/trapnr.h> #include <asm/nospec-branch.h> @@ -92,7 +93,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -116,6 +117,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -126,70 +128,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * In the Xen PV case we must use iret anyway. */ - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ - X86_FEATURE_XENPV - - movq RCX(%rsp), %rcx - movq RIP(%rsp), %r11 - - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * If width of "canonical tail" ever becomes variable, this will need - * to be updated to remain correct on both old and new CPUs. - * - * Change top bits to match most significant bit (47th or 56th bit - * depending on paging mode) in the address. - */ -#ifdef CONFIG_X86_5LEVEL - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 -#else - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -#endif - - /* If this changed %rcx, it was not canonical */ - cmpq %rcx, %r11 - jne swapgs_restore_regs_and_return_to_usermode - - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode - - movq R11(%rsp), %r11 - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne swapgs_restore_regs_and_return_to_usermode - - /* - * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot - * restore RF properly. If the slowpath sets it for whatever reason, we - * need to restore it correctly. - * - * SYSRET can restore TF, but unlike IRET, restoring TF results in a - * trap from userspace immediately after SYSRET. This would cause an - * infinite loop whenever #DB happens with register state that satisfies - * the opportunistic SYSRET conditions. For example, single-stepping - * this user code: - * - * movq $stuck_here, %rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz swapgs_restore_regs_and_return_to_usermode - - /* nothing to check for RSP */ - - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne swapgs_restore_regs_and_return_to_usermode + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* * We win! 
This label is here just for ease of understanding @@ -223,6 +163,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -235,6 +176,7 @@ SYM_CODE_END(entry_SYSCALL_64) */ .pushsection .text, "ax" SYM_FUNC_START(__switch_to_asm) + ANNOTATE_NOENDBR /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -252,7 +194,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary + movq %rbx, PER_CPU_VAR(__stack_chk_guard) #endif /* @@ -309,7 +251,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection @@ -362,10 +310,9 @@ SYM_CODE_END(xen_error_entry) movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .endif - call \cfunc - /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE + call \cfunc jmp error_return .endm @@ -432,14 +379,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -591,10 +530,10 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer into first argument */ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - call \cfunc /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE + call \cfunc jmp paranoid_exit @@ -621,17 +560,28 @@ __irqentry_text_end: SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) IBRS_EXIT -#ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates user mode. */ - testb $3, CS(%rsp) - jnz 1f - ud2 -1: -#endif #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION + ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI +#endif + + STACKLEAK_ERASE + POP_REGS + add $8, %rsp /* orig_ax */ + UNWIND_HINT_IRET_REGS + +.Lswapgs_and_iret: + swapgs + CLEAR_CPU_BUFFERS + /* Assert that the IRET frame indicates user mode. */ + testb $3, 8(%rsp) + jnz .Lnative_iret + ud2 +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +.Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 /* @@ -658,13 +608,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ STACKLEAK_ERASE_NOCLOBBER - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + push %rax + SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax + pop %rax /* Restore RDI. 
*/ popq %rdi - swapgs - jmp .Lnative_iret - + jmp .Lswapgs_and_iret +#endif SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY @@ -774,6 +725,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -791,6 +744,7 @@ _ASM_NOKPROBE(common_interrupt_return) * Is in entry.text as it shouldn't be instrumented. */ SYM_FUNC_START(asm_load_gs_index) + ANNOTATE_NOENDBR FRAME_BEGIN swapgs .Lgs_change: @@ -1019,14 +973,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1147,7 +1101,7 @@ SYM_CODE_END(error_return) * * Registers: * %r14: Used to save/restore the CR3 of the interrupted context - * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. */ SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_ENTRY @@ -1163,8 +1117,8 @@ SYM_CODE_START(asm_exc_nmi) * anyway. * * To handle this case we do the following: - * Check the a special location on the stack that contains - * a variable that is set when NMIs are executing. + * Check a special location on the stack that contains a + * variable that is set when NMIs are executing. * The interrupted task's stack is also checked to see if it * is an NMI stack. * If the variable is not set and the stack is not the NMI @@ -1215,7 +1169,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1237,7 +1191,6 @@ SYM_CODE_START(asm_exc_nmi) */ movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* @@ -1295,8 +1248,8 @@ SYM_CODE_START(asm_exc_nmi) * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call exc_nmi() anyway, so we can just - * resume the outer NMI. + * about to call exc_nmi() anyway, so we can just resume + * the outer NMI. */ movq $repeat_nmi, %rdx @@ -1451,14 +1404,12 @@ end_repeat_nmi: UNWIND_HINT_REGS movq %rsp, %rdi - movq $-1, %rsi call exc_nmi /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE @@ -1503,6 +1454,12 @@ nmi_restore: movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. 
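For reference, the canonicality test that the deleted opportunistic-SYSRET block above performed with a shl/sar pair is equivalent to this sign-extension round trip (a standalone sketch; 48 and 57 correspond to 4- and 5-level paging, i.e. __VIRTUAL_MASK_SHIFT + 1):

#include <stdbool.h>
#include <stdint.h>

/*
 * Shifting the address up so only the implemented bits remain, then
 * arithmetic-shifting back down, replicates the topmost implemented
 * bit into all upper bits.  The address is canonical iff the round
 * trip leaves it unchanged, which is exactly what the removed
 * "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx; cmpq" sequence tested.
 */
static bool is_canonical(uint64_t addr, int vaddr_bits /* 48 or 57 */)
{
	int shift = 64 - vaddr_bits;

	return (uint64_t)((int64_t)(addr << shift) >> shift) == addr;
}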
+ */ + + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this * cannot result in a fault. Similarly, we don't need to worry @@ -1511,18 +1468,17 @@ nmi_restore: iretq SYM_CODE_END(asm_exc_nmi) -#ifndef CONFIG_IA32_EMULATION /* * This handles SYSCALL from 32-bit code. There is no way to program * MSRs to fully disable 32-bit SYSCALL. */ -SYM_CODE_START(ignore_sysret) +SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl -SYM_CODE_END(ignore_sysret) -#endif +SYM_CODE_END(entry_SYSCALL32_ignore) .pushsection .text, "ax" __FUNC_ALIGN @@ -1531,10 +1487,85 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent the Loop Stream Detector (LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. + */ +SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f +.Lret1: RET + .align 64, 0xcc + /* + * As above, shift instructions for the RET at .Lret2 as well. + * + * This should ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this.
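Purely as a structural sketch of the call tree documented above (the real routine must stay in assembly, since what matters is the exact branches executed and their cacheline placement; this C model clears nothing by itself):

/* Five outer call/ret pairs, each wrapping five conditional inner
 * branches; "volatile" only stops the compiler from folding the
 * loops away. */
static void bhb_inner(void)
{
	for (volatile int i = 5; i; i--)
		;
}

static void bhb_model(void)
{
	for (volatile int j = 5; j; j--)
		bhb_inner();
}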
+ */ + .skip 32 - 18, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b +.Lret2: RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 70150298f8bd..a45e1125fc6c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -7,7 +7,6 @@ #include <asm/asm-offsets.h> #include <asm/current.h> #include <asm/errno.h> -#include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> #include <asm/irqflags.h> @@ -58,7 +57,7 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ @@ -90,9 +89,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) cld - IBRS_ENTER - UNTRAIN_RET - /* * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether @@ -116,11 +112,18 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: + /* + * CPU bugs mitigations mechanisms can call other functions. They + * should be invoked after making sure TF is cleared because + * single-step is ignored only for instructions inside the + * entry_SYSENTER_compat function. + */ + IBRS_ENTER + UNTRAIN_RET + CLEAR_BRANCH_HISTORY + movq %rsp, %rdi call do_SYSENTER_32 - /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ - "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -190,7 +193,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -209,16 +212,19 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 + +sysret32_from_system_call: /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV - /* Opportunistic SYSRET */ -sysret32_from_system_call: /* + * Opportunistic SYSRET + * * We are not going to return to userspace from the trampoline * stack. So let's erase the thread stack right now. */ @@ -271,6 +277,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) xorl %r9d, %r9d xorl %r10d, %r10d swapgs + CLEAR_CPU_BUFFERS sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -278,78 +285,15 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) SYM_CODE_END(entry_SYSCALL_compat) /* - * 32-bit legacy system call entry. - * - * 32-bit x86 Linux system calls traditionally used the INT $0x80 - * instruction. INT $0x80 lands here. - * - * This entry point can be used by 32-bit and 64-bit programs to perform - * 32-bit system calls. 
Instances of INT $0x80 can be found inline in - * various programs and libraries. It is also used by the vDSO's - * __kernel_vsyscall fallback for hardware that doesn't support a faster - * entry method. Restarted 32-bit system calls also fall back to INT - * $0x80 regardless of what instruction was originally used to do the - * system call. - * - * This is considered a slow path. It is not used by most libc - * implementations on modern hardware except during process startup. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 + * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. */ -SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_ENTRY - ENDBR - /* - * Interrupts are off on entry. - */ - ASM_CLAC /* Do this early to minimize exposure */ - ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV - - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - - /* switch to thread stack expects orig_ax and rdi to be pushed */ - pushq %rax /* pt_regs->orig_ax */ - - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax - - /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV - - movq %rsp, %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp - - pushq 5*8(%rax) /* regs->ss */ - pushq 4*8(%rax) /* regs->rsp */ - pushq 3*8(%rax) /* regs->eflags */ - pushq 2*8(%rax) /* regs->cs */ - pushq 1*8(%rax) /* regs->ip */ - pushq 0*8(%rax) /* regs->orig_ax */ -.Lint80_keep_stack: - - PUSH_AND_CLEAR_REGS rax=$-ENOSYS - UNWIND_HINT_REGS - - cld - - IBRS_ENTER - UNTRAIN_RET - - movq %rsp, %rdi - call do_int80_syscall_32 - jmp swapgs_restore_regs_and_return_to_usermode -SYM_CODE_END(entry_INT80_compat) +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 000000000000..894f7f16eb80 --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include <linux/export.h> +#include <linux/kvm_types.h> + +#include <asm/asm.h> +#include <asm/fred.h> +#include <asm/segment.h> + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. 
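Both FRED entry points are therefore pure functions of IA32_FRED_CONFIG; a small model of the derivation this comment (and the .org placement of the kernel entry point below) encodes:

#include <stdint.h>

/* Ring-3 events enter at the 4K-aligned base taken from
 * IA32_FRED_CONFIG; ring-0 events enter 256 bytes further in, which
 * is what ".org asm_fred_entrypoint_user + 256" pins down. */
static uint64_t fred_user_entrypoint(uint64_t fred_config)
{
	return fred_config & ~0xfffULL;
}

static uint64_t fred_kernel_entrypoint(uint64_t fred_config)
{
	return (fred_config & ~0xfffULL) + 256;
}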
+ */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) + FRED_EXIT +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. + */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + push $__KERNEL_CS + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. + */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_callee=0 unwind_hint=0 + + movq %rsp, %rdi /* %rdi -> pt_regs */ + /* + * At this point: {rdi, rsi, rdx, rcx, r8, r9}, {r10, r11}, {rax, rdx} + * are clobbered, which corresponds to: arguments, extra caller-saved + * and return. All registers a C function is allowed to clobber. + * + * Notably, the callee-saved registers: {rbx, r12, r13, r14, r15} + * are untouched, with the exception of rbp, which carries the stack + * frame and will be restored before exit. + * + * Further calling another C function will not alter this state. + */ + call __fred_entry_from_kvm /* Call the C entry point */ + + /* + * When FRED, use ERETS to potentially clear NMIs, otherwise simply + * restore the stack pointer. + */ + ALTERNATIVE "nop; nop; mov %rbp, %rsp", \ + __stringify(add $C_PTREGS_SIZE, %rsp; ERETS), \ + X86_FEATURE_FRED + +1: /* + * Objtool doesn't understand ERETS, and the cfi register state is + * different from initial_func_cfi due to PUSH_REGS. Tell it the state + * is similar to where UNWIND_HINT_SAVE is. 
+ */ + UNWIND_HINT_RESTORE + + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 000000000000..94e626cc6a07 --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. + */ +#include <linux/kernel.h> +#include <linux/kdebug.h> +#include <linux/nospec.h> + +#include <asm/desc.h> +#include <asm/fred.h> +#include <asm/idtentry.h> +#include <asm/syscall.h> +#include <asm/trapnr.h> +#include <asm/traps.h> + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, error_code, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, error_code); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, error_code, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, error_code)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return fred_int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.l)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.l)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + 
SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + + SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification), +}; + +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. 
That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f..a67a644d0cfe 100644 --- 
a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -1,10 +1,16 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for i386. */ +// SPDX-License-Identifier: GPL-2.0-only +/* 32-bit system call dispatch */ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> +#include <linux/entry-common.h> +#include <linux/nospec.h> +#include <linux/uaccess.h> +#include <asm/apic.h> +#include <asm/traps.h> +#include <asm/cpufeature.h> #include <asm/syscall.h> #ifdef CONFIG_IA32_EMULATION @@ -14,12 +20,352 @@ #endif #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); - +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *); #include <asm/syscalls_32.h> -#undef __SYSCALL +#undef __SYSCALL -#define __SYSCALL(nr, sym) __ia32_##sym, +#undef __SYSCALL_NORETURN +#define __SYSCALL_NORETURN __SYSCALL -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 +#define __SYSCALL(nr, sym) __ia32_##sym, +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_32.h> + default: return __ia32_sys_ni_syscall(regs); + } +} + +static __always_inline int syscall_32_enter(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + current_thread_info()->status |= TS_COMPAT; + + return (int)regs->orig_ax; +} + +#ifdef CONFIG_IA32_EMULATION +bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); + +static int __init ia32_emulation_override_cmdline(char *arg) +{ + return kstrtobool(arg, &__ia32_enabled); +} +early_param("ia32_emulation", ia32_emulation_override_cmdline); +#endif + +/* + * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL. + */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < IA32_NR_syscalls)) { + unr = array_index_nospec(unr, IA32_NR_syscalls); + regs->ax = ia32_sys_call(regs, unr); + } else if (nr != -1) { + regs->ax = __ia32_sys_ni_syscall(regs); + } +} + +#ifdef CONFIG_IA32_EMULATION +static __always_inline bool int80_is_external(void) +{ + const unsigned int offs = (0x80 / 32) * 0x10; + const u32 bit = BIT(0x80 % 32); + + /* The local APIC on XENPV guests is fake */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* + * If vector 0x80 is set in the APIC ISR then this is an external + * interrupt. Either from broken hardware or injected by a VMM. + * + * Note: In guest mode this is only valid for secure guests where + * the secure module fully controls the vAPIC exposed to the guest. + */ + return apic_read(APIC_ISR + offs) & bit; +} + +/** + * do_int80_emulation - 32-bit legacy syscall C entry from asm + * @regs: syscall arguments in struct pt_args on the stack. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. 
It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * The arguments for the INT $0x80 based syscall are on stack in the + * pt_regs structure: + * eax: system call number + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg6 + */ +__visible noinstr void do_int80_emulation(struct pt_regs *regs) +{ + int nr; + + /* Kernel does not use INT $0x80! */ + if (unlikely(!user_mode(regs))) { + irqentry_enter(regs); + instrumentation_begin(); + panic("Unexpected external interrupt 0x80\n"); + } + + /* + * Establish kernel context for instrumentation, including for + * int80_is_external() below which calls into the APIC driver. + * Identical for soft and external interrupts. + */ + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* Validate that this is a soft interrupt to the extent possible */ + if (unlikely(int80_is_external())) + panic("Unexpected external interrupt 0x80\n"); + + /* + * The low level idtentry code pushed -1 into regs::orig_ax + * and regs::ax contains the syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the following reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on whether the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only have come from user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY, while FRED will + * likely take a different approach if it is ever needed: it + * probably belongs in either fred_intx()/fred_other() or + * asm_fred_entrypoint_user(), depending on whether this ought to be done + * for all entries from userspace or only for system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif /* CONFIG_X86_FRED */ + +#else /* CONFIG_IA32_EMULATION */ + +/* Handles int $0x80 on a 32bit kernel */ +__visible noinstr void do_int80_syscall_32(struct pt_regs *regs) +{ + int nr = syscall_32_enter(regs); + + add_random_kstack_offset(); + /* + * Subtlety here: if ptrace pokes something larger than 2^31-1 into + * orig_ax, the int return value truncates it. This matches + * the semantics of syscall_get_nr(). + */ + nr = syscall_enter_from_user_mode(regs, nr); + instrumentation_begin(); + + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif /* !CONFIG_IA32_EMULATION */ + +static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) +{ + int nr = syscall_32_enter(regs); + int res; + + add_random_kstack_offset(); + /* + * This cannot use syscall_enter_from_user_mode() as it has to + * fetch EBP before invoking any of the syscall entry work + * functions. + */ + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + /* Fetch EBP from where the vDSO stashed it. */ + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Micro-optimization: the pointer we're following is + * explicitly 32 bits, so it can't be out of range. + */ + res = __get_user(*(u32 *)&regs->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } else { + res = get_user(*(u32 *)&regs->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } + + if (res) { + /* User code screwed up. */ + regs->ax = -EFAULT; + + local_irq_disable(); + instrumentation_end(); + irqentry_exit_to_user_mode(regs); + return false; + } + + nr = syscall_enter_from_user_mode_work(regs, nr); + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); + return true; +} + +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + /* Invoke the syscall. If it failed, keep it simple: use IRET. */ + if (!__do_fast_syscall_32(regs)) + return false; + + /* + * Check that the register state is valid for using SYSRETL/SYSEXIT + * to exit to userspace. Otherwise use the slower but fully capable + * IRET exit path.
+ */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* EIP must point to the VDSO landing pad */ + if (unlikely(regs->ip != landing_pad)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) + return false; + + /* If the TF, RF, or VM flags are set, use IRET */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) + return false; + + /* Use SYSRETL/SYSEXIT to exit to userspace */ + return true; +} + +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9..b6e68ea98b83 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -1,18 +1,141 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for x86-64. */ +// SPDX-License-Identifier: GPL-2.0-only +/* 64-bit system call dispatch */ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> +#include <linux/entry-common.h> +#include <linux/nospec.h> #include <asm/syscall.h> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); #include <asm/syscalls_64.h> -#undef __SYSCALL +#ifdef CONFIG_X86_X32_ABI +#include <asm/syscalls_x32.h> +#endif +#undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#undef __SYSCALL_NORETURN +#define __SYSCALL_NORETURN __SYSCALL -asmlinkage const sys_call_ptr_t sys_call_table[] = { +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#define __SYSCALL(nr, sym) __x64_##sym, +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_64.h> + default: return __x64_sys_ni_syscall(regs); + } +} + +#ifdef CONFIG_X86_X32_ABI +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_x32.h> + default: return __x64_sys_ni_syscall(regs); + } +} +#endif + +static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); + regs->ax = x64_sys_call(regs, unr); + return true; + } + return false; +} + +static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) +{ + /* + * Adjust the starting offset of the table, and convert numbers + * < __X32_SYSCALL_BIT to very high and thus out of range + * numbers for comparisons. 
+ */ + unsigned int xnr = nr - __X32_SYSCALL_BIT; + + if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { + xnr = array_index_nospec(xnr, X32_NR_syscalls); + regs->ax = x32_sys_call(regs, xnr); + return true; + } + return false; +} + +/* Returns true to return using SYSRET, or false to use IRET */ +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) +{ + add_random_kstack_offset(); + nr = syscall_enter_from_user_mode(regs, nr); + + instrumentation_begin(); + + if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { + /* Invalid system call, but still a system call. */ + regs->ax = __x64_sys_ni_syscall(regs); + } + + instrumentation_end(); + syscall_exit_to_user_mode(regs); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower but fully capable IRET + * exit path. + */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * TASK_SIZE_MAX covers all user-accessible addresses other than + * the deprecated vsyscall page. + */ + if (unlikely(regs->ip >= TASK_SIZE_MAX)) + return false; + + /* + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. + */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; +} diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c deleted file mode 100644 index bdd0e03a1265..000000000000 --- a/arch/x86/entry/syscall_x32.c +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for x32 ABI. */ - -#include <linux/linkage.h> -#include <linux/sys.h> -#include <linux/cache.h> -#include <linux/syscalls.h> -#include <asm/syscall.h> - -#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#include <asm/syscalls_x32.h> -#undef __SYSCALL - -#define __SYSCALL(nr, sym) __x64_##sym, - -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include <asm/syscalls_x32.h> -}; diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2d0b1bd866ea..e979a3eac7a3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 32-bit system call numbers and entry vectors # # The format is: -# <number> <abi> <name> <entry point> <compat entry point> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] # # The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for # sys_*() system calls and compat_sys_*() compat system calls if @@ -12,7 +13,7 @@ # The abi is always "i386" for this file. 
# 0 i386 restart_syscall sys_restart_syscall -1 i386 exit sys_exit +1 i386 exit sys_exit - noreturn 2 i386 fork sys_fork 3 i386 read sys_read 4 i386 write sys_write @@ -263,8 +264,8 @@ 249 i386 io_cancel sys_io_cancel 250 i386 fadvise64 sys_ia32_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) -252 i386 exit_group sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +252 i386 exit_group sys_exit_group - noreturn +253 i386 lookup_dcookie 254 i386 epoll_create sys_epoll_create 255 i386 epoll_ctl sys_epoll_ctl 256 i386 epoll_wait sys_epoll_wait @@ -395,7 +396,7 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx -384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +384 i386 arch_prctl sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents 386 i386 rseq sys_rseq 393 i386 semget sys_semget @@ -420,7 +421,7 @@ 412 i386 utimensat_time64 sys_utimensat 413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 i386 io_pgetevents_time64 sys_io_pgetevents +416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 i386 mq_timedsend_time64 sys_mq_timedsend 419 i386 mq_timedreceive_time64 sys_mq_timedreceive @@ -457,3 +458,21 @@ 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 +453 i386 map_shadow_stack sys_map_shadow_stack +454 i386 futex_wake sys_futex_wake +455 i386 futex_wait sys_futex_wait +456 i386 futex_requeue sys_futex_requeue +457 i386 statmount sys_statmount +458 i386 listmount sys_listmount +459 i386 lsm_get_self_attr sys_lsm_get_self_attr +460 i386 lsm_set_self_attr sys_lsm_set_self_attr +461 i386 lsm_list_modules sys_lsm_list_modules +462 i386 mseal sys_mseal +463 i386 setxattrat sys_setxattrat +464 i386 getxattrat sys_getxattrat +465 i386 listxattrat sys_listxattrat +466 i386 removexattrat sys_removexattrat +467 i386 open_tree_attr sys_open_tree_attr +468 i386 file_getattr sys_file_getattr +469 i386 file_setattr sys_file_setattr +470 i386 listns sys_listns diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 814768249eae..8a4ac4841be6 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 64-bit system call numbers and entry vectors # # The format is: -# <number> <abi> <name> <entry point> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] # # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls # @@ -68,7 +69,7 @@ 57 common fork sys_fork 58 common vfork sys_vfork 59 64 execve sys_execve -60 common exit sys_exit +60 common exit sys_exit - noreturn 61 common wait4 sys_wait4 62 common kill sys_kill 63 common uname sys_newuname @@ -220,7 +221,7 @@ 209 64 io_submit sys_io_submit 210 common io_cancel sys_io_cancel 211 64 get_thread_area -212 common lookup_dcookie sys_lookup_dcookie +212 common lookup_dcookie 213 common epoll_create sys_epoll_create 214 64 epoll_ctl_old 215 64 epoll_wait_old @@ -239,7 +240,7 @@ 228 common clock_gettime sys_clock_gettime 229 common clock_getres sys_clock_getres 230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group +231 common exit_group sys_exit_group - 
noreturn 232 common epoll_wait sys_epoll_wait 233 common epoll_ctl sys_epoll_ctl 234 common tgkill sys_tgkill @@ -343,6 +344,8 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal @@ -374,6 +377,24 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr +470 common listns sys_listns # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S index 27b5da2111ac..119ebdc3d362 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk.S @@ -4,43 +4,10 @@ * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. */ +#include <linux/export.h> #include <linux/linkage.h> #include "calling.h" #include <asm/asm.h> -#include <asm/export.h> - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func -SYM_FUNC_START(\name) - pushq %rbp - movq %rsp, %rbp - - pushq %rdi - pushq %rsi - pushq %rdx - pushq %rcx - pushq %rax - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - - call \func - - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - popq %rbp - RET -SYM_FUNC_END(\name) - _ASM_NOKPROBE(\name) - .endm THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S deleted file mode 100644 index ff6e7003da97..000000000000 --- a/arch/x86/entry/thunk_32.S +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Trampoline to trace irqs off. 
(otherwise CALLER_ADDR1 might crash) - * Copyright 2008 by Steven Rostedt, Red Hat, Inc - * (inspired by Andi Kleen's thunk_64.S) - */ - #include <linux/linkage.h> - #include <asm/asm.h> - #include <asm/export.h> - - /* put return address in eax (arg1) */ - .macro THUNK name, func, put_ret_addr_in_eax=0 -SYM_CODE_START_NOALIGN(\name) - pushl %eax - pushl %ecx - pushl %edx - - .if \put_ret_addr_in_eax - /* Place EIP in the arg1 */ - movl 3*4(%esp), %eax - .endif - - call \func - popl %edx - popl %ecx - popl %eax - RET - _ASM_NOKPROBE(\name) -SYM_CODE_END(\name) - .endm - - THUNK preempt_schedule_thunk, preempt_schedule - THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace - EXPORT_SYMBOL(preempt_schedule_thunk) - EXPORT_SYMBOL(preempt_schedule_notrace_thunk) - diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 6a1821bd7d5e..f247f5f5cb44 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,66 +3,36 @@ # Building vDSO images for x86. # -# Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +# Include the generic Makefile to check the built vDSO: +include $(srctree)/lib/vdso/Makefile.include -# Sanitizer runtimes are unavailable and cannot be linked here. -KASAN_SANITIZE := n -KMSAN_SANITIZE_vclock_gettime.o := n -KMSAN_SANITIZE_vgetcpu.o := n - -UBSAN_SANITIZE := n -KCSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y - -# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. -KCOV_INSTRUMENT := n - -VDSO64-$(CONFIG_X86_64) := y -VDSOX32-$(CONFIG_X86_X32_ABI) := y -VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_IA32_EMULATION) := y - -# files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +# Files to link into the vDSO: +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o vobjs-$(CONFIG_X86_SGX) += vsgx.o -# files to link into kernel -obj-y += vma.o extable.o -KASAN_SANITIZE_vma.o := y -UBSAN_SANITIZE_vma.o := y -KCSAN_SANITIZE_vma.o := y -OBJECT_FILES_NON_STANDARD_vma.o := n -OBJECT_FILES_NON_STANDARD_extable.o := n +# Files to link into the kernel: +obj-y += vma.o extable.o -# vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32 +# vDSO images to build: +obj-$(CONFIG_X86_64) += vdso-image-64.o +obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o +obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o -obj-$(VDSO32-y) += vdso32-setup.o - -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) -vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) +vobjs := $(addprefix $(obj)/, $(vobjs-y)) +vobjs32 := $(addprefix $(obj)/, $(vobjs32-y)) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) targets += vdso32/vdso32.lds $(vobjs32-y) -# Build the vDSO image C files and link them in. 
-vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) -obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) +targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 \ -z max-page-size=4096 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE @@ -86,13 +56,13 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) CFL += $(RETPOLINE_VDSO_CFLAGS) endif endif -$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(KSTACK_ERASE_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO # @@ -103,6 +73,7 @@ CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg CFLAGS_REMOVE_vsgx.o = -pg +CFLAGS_REMOVE_vgetrandom.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. @@ -122,7 +93,7 @@ VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ vobjx32s-y := $(vobjs-y:.o=-x32.o) # same thing, but in the output directory -vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) +vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y)) # Convert 64bit object file to x32 for x32 vDSO. 
quiet_cmd_x32 = X32 $@ @@ -152,6 +123,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(KSTACK_ERASE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32)) @@ -162,8 +134,9 @@ KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += -DBUILD_VDSO -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) endif @@ -180,41 +153,10 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE quiet_cmd_vdso = VDSO $@ cmd_vdso = $(LD) -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -T $(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + -T $(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \ +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 --no-undefined \ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack -GCOV_PROFILE := n quiet_cmd_vdso_and_check = VDSO $@ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check) - -# -# Install the unstripped copies of vdso*.so. If our toolchain supports -# build-id, install .build-id links as well. -# -quiet_cmd_vdso_install = INSTALL $(@:install_%=%) -define cmd_vdso_install - cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ - if readelf -n $< |grep -q 'Build ID'; then \ - buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ - first=`echo $$buildid | cut -b-2`; \ - last=`echo $$buildid | cut -b3-`; \ - mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ - ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ - fi -endef - -vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) - -$(MODLIB)/vdso: FORCE - @mkdir -p $(MODLIB)/vdso - -$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso - $(call cmd,vdso_install) - -PHONY += vdso_install $(vdso_img_insttargets) -vdso_install: $(vdso_img_insttargets) - -clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh deleted file mode 100755 index 7ee90a9b549d..000000000000 --- a/arch/x86/entry/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -nm="$1" -file="$2" -$nm "$file" | grep '^ *U' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h index b56f6b012941..baba612b832c 100644 --- a/arch/x86/entry/vdso/extable.h +++ b/arch/x86/entry/vdso/extable.h @@ -7,7 +7,7 @@ * vDSO uses a dedicated handler the addresses are relative to the overall * exception table, not each individual entry. 
*/ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ ASM_VDSO_EXTABLE_HANDLE from to diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 7d70935b6758..0debc194bd78 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -11,12 +11,10 @@ #include <linux/time.h> #include <linux/kernel.h> #include <linux/types.h> +#include <vdso/gettime.h> #include "../../../../lib/vdso/gettimeofday.c" -extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); -extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t); - int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); @@ -35,9 +33,6 @@ __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__v #if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64) /* both 64-bit and x32 use these */ -extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); -extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res); - int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { return __cvdso_clock_gettime(clock, ts); @@ -56,9 +51,6 @@ int clock_getres(clockid_t, struct __kernel_timespec *) #else /* i386 only */ -extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts); -extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res); - int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts) { return __cvdso_clock_gettime32(clock, ts); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index bafa73f09e92..ec1ac191a057 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/vdso.h> +#include <asm/vdso/vsyscall.h> +#include <vdso/datapage.h> /* * Linker script for vDSO. This is an ELF shared object prelinked to @@ -16,23 +18,11 @@ SECTIONS * segment. */ - vvar_start = . - 4 * PAGE_SIZE; - vvar_page = vvar_start; + VDSO_VVAR_SYMS - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#include <asm/vvar.h> -#undef EMIT_VVAR - - pvclock_page = vvar_start + PAGE_SIZE; - hvclock_page = vvar_start + 2 * PAGE_SIZE; - timens_page = vvar_start + 3 * PAGE_SIZE; - -#undef _ASM_X86_VVAR_H - /* Place all vvars in timens too at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset; -#include <asm/vvar.h> -#undef EMIT_VVAR + vclock_pages = VDSO_VCLOCK_PAGES_START(vdso_u_data); + pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE; + hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index e8c60ae7a7c8..0bab5f4af6d1 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -30,6 +30,8 @@ VERSION { #ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; #endif + getrandom; + __vdso_getrandom; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 90d15f2a7205..f84e8f8fa5fe 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -69,33 +69,12 @@ const char *outfilename; -/* Symbols that we need in vdso2c. 
*/ -enum { - sym_vvar_start, - sym_vvar_page, - sym_pvclock_page, - sym_hvclock_page, - sym_timens_page, -}; - -const int special_pages[] = { - sym_vvar_page, - sym_pvclock_page, - sym_hvclock_page, - sym_timens_page, -}; - struct vdso_sym { const char *name; bool export; }; struct vdso_sym required_syms[] = { - [sym_vvar_start] = {"vvar_start", true}, - [sym_vvar_page] = {"vvar_page", true}, - [sym_pvclock_page] = {"pvclock_page", true}, - [sym_hvclock_page] = {"hvclock_page", true}, - [sym_timens_page] = {"timens_page", true}, {"VDSO32_NOTE_MASK", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 67b3e37576a6..78ed1c1f28b9 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,26 +150,6 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } } - /* Validate mapping addresses. */ - for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - INT_BITS symval = syms[special_pages[i]]; - - if (!symval) - continue; /* The mapping isn't used; ignore it. */ - - if (symval % 4096) - fail("%s must be a multiple of 4096\n", - required_syms[i].name); - if (symval + 4096 < syms[sym_vvar_start]) - fail("%s underruns vvar_start\n", - required_syms[i].name); - if (symval + 4096 > 0) - fail("%s is on the wrong side of the vdso text\n", - required_syms[i].name); - } - if (syms[sym_vvar_start] % 4096) - fail("vvar_begin must be a multiple of 4096\n"); - if (!image_name) { fwrite(stripped_addr, stripped_len, 1, outfile); return; diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index f3b3cacbcbb0..8894013eea1d 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -51,15 +51,17 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -#ifdef CONFIG_X86_64 #ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> -static struct ctl_table abi_table2[] = { +static const struct ctl_table vdso_table[] = { { +#ifdef CONFIG_X86_64 .procname = "vsyscall32", +#else + .procname = "vdso_enabled", +#endif .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, @@ -67,15 +69,18 @@ static struct ctl_table abi_table2[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static __init int ia32_binfmt_init(void) { - register_sysctl("abi", abi_table2); +#ifdef CONFIG_X86_64 + /* Register vsyscall32 into the ABI table */ + register_sysctl("abi", vdso_table); +#else + register_sysctl_init("vm", vdso_table); +#endif return 0; } __initcall(ia32_binfmt_init); #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..bcba5639b8ee --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +.section .rodata, "a" +.align 16 +CONSTANTS: .octa 0x6b20657479622d323320646e61707865 +.text + +/* + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number + * of blocks of output with a nonce of 0, taking an input key and 8-byte + * counter. Importantly does not spill to the stack. 
Its arguments are: + * + * rdi: output bytes + * rsi: 32-byte key input + * rdx: 8-byte counter input/output + * rcx: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +.set output, %rdi +.set key, %rsi +.set counter, %rdx +.set nblocks, %rcx +.set i, %al +/* xmm registers are *not* callee-save. */ +.set temp, %xmm0 +.set state0, %xmm1 +.set state1, %xmm2 +.set state2, %xmm3 +.set state3, %xmm4 +.set copy0, %xmm5 +.set copy1, %xmm6 +.set copy2, %xmm7 +.set copy3, %xmm8 +.set one, %xmm9 + + /* copy0 = "expand 32-byte k" */ + movaps CONSTANTS(%rip),copy0 + /* copy1,copy2 = key */ + movups 0x00(key),copy1 + movups 0x10(key),copy2 + /* copy3 = counter || zero nonce */ + movq 0x00(counter),copy3 + /* one = 1 || 0 */ + movq $1,%rax + movq %rax,one + +.Lblock: + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ + movdqa copy0,state0 + movdqa copy1,state1 + movdqa copy2,state2 + movdqa copy3,state3 + + movb $10,i +.Lpermute: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + pshufd $0x39,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[3,0,1,2] */ + pshufd $0x93,state3,state3 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + pshufd $0x93,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + pshufd $0x39,state3,state3 + + decb i + jnz .Lpermute + + /* output0 = state0 + copy0 */ + paddd copy0,state0 + movups state0,0x00(output) + /* output1 = state1 + copy1 */ + paddd copy1,state1 + movups state1,0x10(output) + /* output2 = state2 + copy2 */ + paddd copy2,state2 + movups state2,0x20(output) + /* output3 = state3 + copy3 */ + paddd copy3,state3 + movups state3,0x30(output) + + /* ++copy3.counter */ + paddq one,copy3 + + /* output += 64, --nblocks */ + addq $64,output + decq nblocks + jnz .Lblock + + /* counter = copy3.counter */ + movq copy3,0x00(counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these again. 
*/ + pxor state0,state0 + pxor state1,state1 + pxor state2,state2 + pxor state3,state3 + pxor copy1,copy1 + pxor copy2,copy2 + pxor temp,temp + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c new file mode 100644 index 000000000000..430862b8977c --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ +#include <linux/types.h> + +#include "../../../../lib/vdso/getrandom.c" + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} + +ssize_t getrandom(void *, size_t, unsigned int, void *, size_t) + __attribute__((weak, alias("__vdso_getrandom"))); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7645730dc228..afe105b2f907 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -14,29 +14,20 @@ #include <linux/elf.h> #include <linux/cpu.h> #include <linux/ptrace.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <asm/pvclock.h> #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> -#include <asm/vvar.h> #include <asm/tlb.h> #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> +#include <asm/vdso/vsyscall.h> #include <clocksource/hyperv_timer.h> -#undef _ASM_X86_VVAR_H -#define EMIT_VVAR(name, offset) \ - const size_t name ## _offset = offset; -#include <asm/vvar.h> - -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page + _vdso_data_offset); -} -#undef EMIT_VVAR +static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES); unsigned int vclocks_used __read_mostly; @@ -56,7 +47,6 @@ int __init init_vdso_image(const struct vdso_image *image) return 0; } -static const struct vm_special_mapping vvar_mapping; struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, @@ -75,7 +65,6 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, static void vdso_fix_landing(const struct vdso_image *image, struct vm_area_struct *new_vma) { -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION if (in_ia32_syscall() && image == &vdso_image_32) { struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; @@ -86,7 +75,6 @@ static void vdso_fix_landing(const struct vdso_image *image, if (regs->ip == old_land_addr) regs->ip = new_vma->vm_start + vdso_land; } -#endif } static int vdso_mremap(const struct vm_special_mapping *sm, @@ -100,106 +88,32 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -#ifdef CONFIG_TIME_NS -/* - * The vvar page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. 
- */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_mapping)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) +static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) { - const struct vdso_image *image = vma->vm_mm->context.vdso_image; - unsigned long pfn; - long sym_offset; - - if (!image) - return VM_FAULT_SIGBUS; - - sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + - image->sym_vvar_start; - - /* - * Sanity check: a symbol offset of zero means that the page - * does not exist for this vdso image, not that the page is at - * offset zero relative to the text mapping. This should be - * impossible here, because sym_offset should only be zero for - * the page past the end of the vvar mapping. - */ - if (sym_offset == 0) - return VM_FAULT_SIGBUS; - - if (sym_offset == image->sym_vvar_page) { - struct page *timens_page = find_timens_vvar_page(vma); - - pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; - - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the sym_vvar_page offset and - * the real VVAR page is mapped with the sym_timens_page - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (timens_page) { - unsigned long addr; - vm_fault_t err; - - /* - * Optimization: inside time namespace pre-fault - * VVAR page too. As on timens page there are only - * offsets for clocks on VVAR, it'll be faulted - * shortly by VDSO code. 
- */ - addr = vmf->address + (image->sym_timens_page - sym_offset); - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - - pfn = page_to_pfn(timens_page); - } - - return vmf_insert_pfn(vma, vmf->address, pfn); - } else if (sym_offset == image->sym_pvclock_page) { + switch (vmf->pgoff) { +#ifdef CONFIG_PARAVIRT_CLOCK + case VDSO_PAGE_PVCLOCK_OFFSET: + { struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va(); - if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) { + + if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) return vmf_insert_pfn_prot(vma, vmf->address, __pa(pvti) >> PAGE_SHIFT, pgprot_decrypted(vma->vm_page_prot)); - } - } else if (sym_offset == image->sym_hvclock_page) { - pfn = hv_get_tsc_pfn(); - + break; + } +#endif /* CONFIG_PARAVIRT_CLOCK */ +#ifdef CONFIG_HYPERV_TIMER + case VDSO_PAGE_HVCLOCK_OFFSET: + { + unsigned long pfn = hv_get_tsc_pfn(); if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, pfn); - } else if (sym_offset == image->sym_timens_page) { - struct page *timens_page = find_timens_vvar_page(vma); - - if (!timens_page) - return VM_FAULT_SIGBUS; - - pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; - return vmf_insert_pfn(vma, vmf->address, pfn); + break; + } +#endif /* CONFIG_HYPERV_TIMER */ } return VM_FAULT_SIGBUS; @@ -210,9 +124,9 @@ static const struct vm_special_mapping vdso_mapping = { .fault = vdso_fault, .mremap = vdso_mremap, }; -static const struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, +static const struct vm_special_mapping vvar_vclock_mapping = { + .name = "[vvar_vclock]", + .fault = vvar_vclock_fault, }; /* @@ -231,13 +145,13 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) return -EINTR; addr = get_unmapped_area(NULL, addr, - image->size - image->sym_vvar_start, 0, 0); + image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - text_start = addr - image->sym_vvar_start; + text_start = addr + __VDSO_PAGES * PAGE_SIZE; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -246,7 +160,8 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) text_start, image->size, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_SEALED_SYSMAP, &vdso_mapping); if (IS_ERR(vma)) { @@ -254,79 +169,35 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) goto up_fail; } + vma = vdso_install_vvar_mapping(mm, addr); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + do_munmap(mm, text_start, image->size, NULL); + goto up_fail; + } + vma = _install_special_mapping(mm, - addr, - -image->sym_vvar_start, + VDSO_VCLOCK_PAGES_START(addr), + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); + VM_PFNMAP|VM_SEALED_SYSMAP, + &vvar_vclock_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); - } else { - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; + do_munmap(mm, addr, image->size, NULL); + goto up_fail; } + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; + up_fail: mmap_write_unlock(mm); return ret; } -#ifdef CONFIG_X86_64 -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. 
- * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= DEFAULT_MAP_WINDOW) - end = DEFAULT_MAP_WINDOW; - end -= len; - - if (end > start) { - offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. - */ - addr = align_vdso_addr(addr); - - return addr; -} - -static int map_vdso_randomized(const struct vdso_image *image) -{ - unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); - - return map_vdso(image, addr); -} -#endif - int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; @@ -343,7 +214,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) */ for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || - vma_is_special_mapping(vma, &vvar_mapping)) { + vma_is_special_mapping(vma, &vdso_vvar_mapping) || + vma_is_special_mapping(vma, &vvar_vclock_mapping)) { mmap_write_unlock(mm); return -EEXIST; } @@ -353,7 +225,6 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) return map_vdso(image, addr); } -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ @@ -361,45 +232,38 @@ static int load_vdso32(void) return map_vdso(&vdso_image_32, 0); } -#endif -#ifdef CONFIG_X86_64 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - if (!vdso64_enabled) - return 0; + if (IS_ENABLED(CONFIG_X86_64)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, 0); + } - return map_vdso_randomized(&vdso_image_64); + return load_vdso32(); } #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, bool x32) { -#ifdef CONFIG_X86_X32_ABI - if (x32) { + if (IS_ENABLED(CONFIG_X86_X32_ABI) && x32) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_x32); + return map_vdso(&vdso_image_x32, 0); } -#endif -#ifdef CONFIG_IA32_EMULATION - return load_vdso32(); -#else + + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + return load_vdso32(); + return 0; -#endif -} -#endif -#else -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - return load_vdso32(); } #endif bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) const struct vdso_image *image = current->mm->context.vdso_image; unsigned long vdso = (unsigned long) current->mm->context.vdso; @@ -408,7 +272,6 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) return true; } -#endif return false; } diff --git 
a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S index d77d278ee9dd..37a3d4c02366 100644 --- a/arch/x86/entry/vdso/vsgx.S +++ b/arch/x86/entry/vdso/vsgx.S @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> -#include <asm/export.h> #include <asm/errno.h> #include <asm/enclu.h> diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index e0ca8120aea8..6e6c0a740837 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); @@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) static bool write_ok_or_segv(unsigned long ptr, size_t size) { - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_err, this could go away. - */ - if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = &current->thread; @@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { - struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_err; long ret; unsigned long orig_dx; @@ -131,7 +124,12 @@ bool emulate_vsyscall(unsigned long error_code, if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; - if (!(error_code & X86_PF_INSTR)) { + /* + * Assume that faults at regs->ip are because of an + * instruction fetch. Return early and avoid + * emulation for faults during data accesses: + */ + if (address != regs->ip) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; @@ -144,12 +142,18 @@ bool emulate_vsyscall(unsigned long error_code, } /* + * X86_PF_INSTR is only set when NX is supported. When + * available, use it to double-check that the emulation code + * is only being used for instruction fetches: + */ + if (cpu_feature_enabled(X86_FEATURE_NX)) + WARN_ON_ONCE(!(error_code & X86_PF_INSTR)); + + /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. */ - WARN_ON_ONCE(address != regs->ip); - if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); @@ -172,8 +176,6 @@ bool emulate_vsyscall(unsigned long error_code, goto sigsegv; } - tsk = current; - /* * Check for access_ok violations and find the syscall nr. * @@ -234,12 +236,8 @@ bool emulate_vsyscall(unsigned long error_code, goto do_ret; /* skip requested */ /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. + * With a real vsyscall, page faults cause SIGSEGV. */ - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; - current->thread.sig_on_uaccess_err = 1; - ret = -EFAULT; switch (vsyscall_nr) { case 0: @@ -262,23 +260,12 @@ bool emulate_vsyscall(unsigned long error_code, break; } - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; - check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall.
*/ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ + goto sigsegv; } regs->ax = ret; @@ -365,9 +352,7 @@ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); -#if CONFIG_PGTABLE_LEVELS >= 5 set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); -#endif pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR);
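A reading aid for the syscall_32.c and syscall_64.c hunks above: the patch retires the indirect call through sys_call_table[] (the table survives only because kernel/trace/trace_syscalls.c wants the addresses) and instead expands the generated syscall list a second time into a switch statement, so every system call is dispatched with a direct call rather than an indirect branch, with array_index_nospec() still clamping the number beforehand. The following stand-alone C program is a minimal sketch of just that dispatch pattern; the DEMO_* names and the toy pt_regs are invented for illustration, and the speculation hardening is omitted.

#include <stdio.h>

/* Toy stand-in for the kernel's register frame. */
struct pt_regs { long di, si; };

static long demo_sys_read(const struct pt_regs *regs)  { return regs->di; }
static long demo_sys_write(const struct pt_regs *regs) { return regs->si; }
static long demo_sys_ni_syscall(const struct pt_regs *regs)
{
	(void)regs;
	return -38;	/* -ENOSYS */
}

/* The x-macro list: one entry per syscall number. */
#define DEMO_SYSCALLS(X)	\
	X(0, demo_sys_read)	\
	X(1, demo_sys_write)

/* Expand the list into a switch: every case is a direct call. */
#define DEMO_CASE(nr, sym) case nr: return sym(regs);
static long demo_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	DEMO_SYSCALLS(DEMO_CASE)
	default: return demo_sys_ni_syscall(regs);
	}
}
#undef DEMO_CASE

int main(void)
{
	struct pt_regs regs = { .di = 123, .si = 456 };

	printf("%ld %ld %ld\n",
	       demo_sys_call(&regs, 0),	/* 123 */
	       demo_sys_call(&regs, 1),	/* 456 */
	       demo_sys_call(&regs, 99));	/* -38: out of range */
	return 0;
}

In the kernel the same effect comes from redefining __SYSCALL() before each #include of the generated asm/syscalls_*.h headers, so adding a syscall to the *.tbl files shown above produces both the extern declaration and the switch case automatically.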

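One more note, on the new vgetrandom-chacha.S: the SSE2 routine keeps the 4x4 ChaCha20 state matrix in four XMM registers, one row each, so a single paddd/pxor/shift/por sequence performs a quarter-round on all four columns at once, and the pshufd shuffles between the two half-rounds rotate the rows so that the diagonals line up as columns. The plain-C sketch below is the textbook scalar form of the same permutation (per RFC 8439), offered only as a reference for reading the asm; it is not the kernel's implementation, and it omits the feed-forward addition of the saved initial state and the counter increment that the asm performs after the rounds.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* The ChaCha quarter-round; rotation counts match the asm: 16/12/8/7. */
static inline void quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a],  8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c],  7);
}

/* The 20-round ChaCha20 permutation: 10 column+diagonal double rounds. */
void chacha20_permute(uint32_t x[16])
{
	for (int i = 0; i < 10; i++) {
		/* Column round: what the asm computes before the shuffles. */
		quarter_round(x, 0, 4,  8, 12);
		quarter_round(x, 1, 5,  9, 13);
		quarter_round(x, 2, 6, 10, 14);
		quarter_round(x, 3, 7, 11, 15);
		/* Diagonal round: what the pshufd-rotated registers compute. */
		quarter_round(x, 0, 5, 10, 15);
		quarter_round(x, 1, 6, 11, 12);
		quarter_round(x, 2, 7,  8, 13);
		quarter_round(x, 3, 4,  9, 14);
	}
}

The ten-iteration loop corresponds to the "movb $10,i" / "decb i" loop in the asm: each pass through .Lpermute covers one column round and one diagonal round, for twenty rounds in total.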