Diffstat (limited to 'arch/x86/entry')
25 files changed, 1165 insertions, 348 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b..ce1cc1622385 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -17,7 +17,10 @@ obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ -obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o +obj-$(CONFIG_PREEMPTION) += thunk.o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o + obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o - diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f6907627172b..ea81770629ee 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -87,14 +87,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi /* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_bp=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -109,7 +112,9 @@ For 32-bit we have the following conventions - kernel is built with xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ xorl %ebx, %ebx /* nospec rbx */ + .if \clear_bp xorl %ebp, %ebp /* nospec rbp */ + .endif xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ @@ -117,9 +122,9 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_bp=\clear_bp .endm .macro POP_REGS pop_rdi=1 @@ -142,10 +147,10 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. 
Flip bit 12 to switch between the two * halves: */ #define PTI_USER_PGTABLE_BIT PAGE_SHIFT @@ -160,7 +165,7 @@ For 32-bit we have the following conventions - kernel is built with .macro ADJUST_KERNEL_CR3 reg:req ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID - /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg .endm @@ -173,10 +178,9 @@ For 32-bit we have the following conventions - kernel is built with .endm #define THIS_CPU_user_pcid_flush_mask \ - PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask) -.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req - ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI +.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req mov %cr3, \scratch_reg ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID @@ -206,13 +210,20 @@ For 32-bit we have the following conventions - kernel is built with /* Flip the PGD to the user version */ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg mov \scratch_reg, %cr3 +.endm + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2 .Lend_\@: .endm .macro SWITCH_TO_USER_CR3_STACK scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI pushq %rax - SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax + SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax popq %rax +.Lend_\@: .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req @@ -233,17 +244,19 @@ For 32-bit we have the following conventions - kernel is built with .Ldone_\@: .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +/* Restore CR3 from a kernel context. May restore a user CR3 value. */ +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI - ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID - /* - * KERNEL pages can always resume with NOFLUSH as we do - * explicit flushes. + * If CR3 contained the kernel page tables at the paranoid exception + * entry, then there is nothing to restore as CR3 is not modified while + * handling the exception. */ bt $PTI_USER_PGTABLE_BIT, \save_reg - jnc .Lnoflush_\@ + jnc .Lend_\@ + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID /* * Check if there's a pending flush for the user ASID we're @@ -251,25 +264,17 @@ For 32-bit we have the following conventions - kernel is built with */ movq \save_reg, \scratch_reg andq $(0x7FF), \scratch_reg - bt \scratch_reg, THIS_CPU_user_pcid_flush_mask - jnc .Lnoflush_\@ - btr \scratch_reg, THIS_CPU_user_pcid_flush_mask - jmp .Lwrcr3_\@ + jc .Lwrcr3_\@ -.Lnoflush_\@: SET_NOFLUSH_BIT \save_reg .Lwrcr3_\@: - /* - * The CR3 write could be avoided when not changing its value, - * but would require a CR3 read *and* a scratch register. 
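For readers unfamiliar with the PTI CR3 layout these macros rely on, below is a minimal C sketch of the bit manipulation they perform. It assumes the upstream constants (PTI_USER_PGTABLE_BIT == PAGE_SHIFT, the user PCID bit at bit 11, the CR3 no-flush bit at bit 63), the helper names are illustrative only, and the per-CPU user_pcid_flush_mask bookkeeping and the kernel-side no-flush optimization are intentionally omitted.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT                      12
#define PTI_USER_PGTABLE_BIT            PAGE_SHIFT  /* bit 12 selects the user half of the 8k PGD */
#define PTI_USER_PCID_BIT               11          /* X86_CR3_PTI_PCID_USER_BIT */
#define CR3_NOFLUSH_BIT                 63          /* suppress the implicit TLB flush on CR3 write */

#define PTI_USER_PGTABLE_MASK           (1ULL << PTI_USER_PGTABLE_BIT)
#define PTI_USER_PCID_MASK              (1ULL << PTI_USER_PCID_BIT)
#define PTI_USER_PGTABLE_AND_PCID_MASK  (PTI_USER_PGTABLE_MASK | PTI_USER_PCID_MASK)

/* SWITCH_TO_KERNEL_CR3: clear both the user PGD bit and the user PCID bit. */
static inline uint64_t pti_kernel_cr3(uint64_t cr3)
{
	return cr3 & ~PTI_USER_PGTABLE_AND_PCID_MASK;
}

/*
 * SWITCH_TO_USER_CR3: flip to the user PGD half (and the user PCID when PCID
 * is enabled); set the no-flush bit when no TLB flush is pending for the ASID.
 */
static inline uint64_t pti_user_cr3(uint64_t cr3, bool have_pcid, bool need_flush)
{
	cr3 |= have_pcid ? PTI_USER_PGTABLE_AND_PCID_MASK : PTI_USER_PGTABLE_MASK;
	if (have_pcid && !need_flush)
		cr3 |= 1ULL << CR3_NOFLUSH_BIT;
	return cr3;
}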
- */ movq \save_reg, %cr3 .Lend_\@: .endm -#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */ .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req .endm @@ -279,7 +284,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 scratch_reg:req save_reg:req +.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif @@ -297,7 +302,7 @@ For 32-bit we have the following conventions - kernel is built with * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. */ .macro IBRS_ENTER save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -326,7 +331,7 @@ For 32-bit we have the following conventions - kernel is built with * regs. Must be called after the last RET. */ .macro IBRS_EXIT save_reg -#ifdef CONFIG_CPU_IBRS_ENTRY +#ifdef CONFIG_MITIGATION_IBRS_ENTRY ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS movl $MSR_IA32_SPEC_CTRL, %ecx @@ -420,3 +425,63 @@ For 32-bit we have the following conventions - kernel is built with .endm #endif /* CONFIG_SMP */ + +#ifdef CONFIG_X86_64 + +/* rdi: arg1 ... normal C conventions. rax is saved/restored. */ +.macro THUNK name, func +SYM_FUNC_START(\name) + pushq %rbp + movq %rsp, %rbp + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + call \func + + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rbp + RET +SYM_FUNC_END(\name) + _ASM_NOKPROBE(\name) +.endm + +#else /* CONFIG_X86_32 */ + +/* put return address in eax (arg1) */ +.macro THUNK name, func, put_ret_addr_in_eax=0 +SYM_CODE_START_NOALIGN(\name) + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + RET + _ASM_NOKPROBE(\name) +SYM_CODE_END(\name) + .endm + +#endif diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 6356060caaf3..14db5b85114c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) if (likely(unr < NR_syscalls)) { unr = array_index_nospec(unr, NR_syscalls); - regs->ax = sys_call_table[unr](regs); + regs->ax = x64_sys_call(regs, unr); return true; } return false; @@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call_table[xnr](regs); + regs->ax = x32_sys_call(regs, xnr); return true; } return false; @@ -150,7 +150,7 @@ early_param("ia32_emulation", ia32_emulation_override_cmdline); #endif /* - * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. + * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL. 
*/ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) { @@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) if (likely(unr < IA32_NR_syscalls)) { unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[unr](regs); + regs->ax = ia32_sys_call(regs, unr); } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } @@ -189,7 +189,8 @@ static __always_inline bool int80_is_external(void) } /** - * int80_emulation - 32-bit legacy syscall entry + * do_int80_emulation - 32-bit legacy syscall C entry from asm + * @regs: syscall arguments in struct pt_regs on the stack. * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in * eax: system call number * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 */ -DEFINE_IDTENTRY_RAW(int80_emulation) +__visible noinstr void do_int80_emulation(struct pt_regs *regs) { int nr; @@ -255,6 +256,71 @@ DEFINE_IDTENTRY_RAW(int80_emulation) instrumentation_end(); syscall_exit_to_user_mode(regs); } + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the following reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on whether the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only be from a user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will + * likely take a different approach if it is ever needed, it + * probably belongs in either fred_intx()/fred_other() or + * asm_fred_entrypoint_user(), depending on whether this ought to be done + * for all entries from userspace or only system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. + */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32-bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax.
*/ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif #else /* CONFIG_IA32_EMULATION */ /* Handles int $0x80 on a 32-bit kernel */ diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 8c8d38f0cb1d..b7ea3e8e9ecc 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -6,6 +6,13 @@ #include <linux/export.h> #include <linux/linkage.h> #include <asm/msr-index.h> +#include <asm/unwind_hints.h> +#include <asm/segment.h> +#include <asm/cache.h> +#include <asm/cpufeatures.h> +#include <asm/nospec-branch.h> + +#include "calling.h" .pushsection .noinstr.text, "ax" @@ -14,9 +21,49 @@ SYM_FUNC_START(entry_ibpb) movl $PRED_CMD_IBPB, %eax xorl %edx, %edx wrmsr + + /* Make sure IBPB clears return stack predictions too. */ + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET SYM_FUNC_END(entry_ibpb) /* For KVM */ EXPORT_SYMBOL_GPL(entry_ibpb); .popsection + +/* + * Define the VERW operand that is disguised as entry code so that + * it can be referenced with KPTI enabled. This ensures VERW can be + * used late in the exit-to-user path after page tables are switched. + */ +.pushsection .entry.text, "ax" + +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_START_NOALIGN(mds_verw_sel) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + .word __KERNEL_DS +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_END(mds_verw_sel); +/* For KVM */ +EXPORT_SYMBOL_GPL(mds_verw_sel); + +.popsection + +THUNK warn_thunk_thunk, __warn_thunk + +#ifndef CONFIG_X86_64 +/* + * Clang's implementation of TLS stack cookies requires the variable in + * question to be a TLS variable. If the variable happens to be defined as an + * ordinary variable with external linkage in the same compilation unit (which + * amounts to the whole of vmlinux with LTO enabled), Clang will drop the + * segment register prefix from the references, resulting in broken code. Work + * around this by avoiding the symbol used in -mstack-protector-guard-symbol= + * entirely in the C code, and use an alias emitted by the linker script + * instead. + */ +#ifdef CONFIG_STACKPROTECTOR +EXPORT_SYMBOL(__ref_stack_chk_guard); +#endif +#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c73047bf9f4b..20be5758c2d2 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -305,7 +305,7 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET) ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which emit the idt entries which are * shared between 32 and 64 bit and emit the __irqentry_text_* markers @@ -875,6 +871,8 @@ SYM_FUNC_START(entry_SYSENTER_32) /* Now ready to switch the cr3 */ SWITCH_TO_USER_CR3 scratch_reg=%eax + /* Clobbers ZF */ + CLEAR_CPU_BUFFERS /* * Restore all flags except IF.
(We restore IF separately because @@ -954,6 +952,7 @@ restore_all_switch_stack: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code + CLEAR_CPU_BUFFERS .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -1166,6 +1165,7 @@ SYM_CODE_START(asm_exc_nmi) CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 + CLEAR_CPU_BUFFERS jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 @@ -1207,6 +1207,7 @@ SYM_CODE_START(asm_exc_nmi) * 1 - orig_ax */ lss (1+5+6)*4(%esp), %esp # back to espfix stack + CLEAR_CPU_BUFFERS jmp .Lirq_return #endif SYM_CODE_END(asm_exc_nmi) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index de6469dffe3a..f52dbe0ad93c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY call do_syscall_64 /* returns with IRQs disabled */ @@ -161,6 +162,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -190,7 +192,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary + movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) #endif /* @@ -247,7 +249,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection @@ -300,10 +308,9 @@ SYM_CODE_END(xen_error_entry) movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .endif - call \cfunc - /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE + call \cfunc jmp error_return .endm @@ -370,14 +377,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number @@ -529,10 +528,10 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer into first argument */ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - call \cfunc /* For some configurations \cfunc ends up being a noreturn. */ - REACHABLE + ANNOTATE_REACHABLE + call \cfunc jmp paranoid_exit @@ -559,17 +558,28 @@ __irqentry_text_end: SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) IBRS_EXIT -#ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates user mode. 
*/ - testb $3, CS(%rsp) - jnz 1f - ud2 -1: -#endif #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION + ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI +#endif + + STACKLEAK_ERASE + POP_REGS + add $8, %rsp /* orig_ax */ + UNWIND_HINT_IRET_REGS + +.Lswapgs_and_iret: + swapgs + CLEAR_CPU_BUFFERS + /* Assert that the IRET frame indicates user mode. */ + testb $3, 8(%rsp) + jnz .Lnative_iret + ud2 +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +.Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 /* @@ -596,13 +606,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ STACKLEAK_ERASE_NOCLOBBER - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + push %rax + SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax + pop %rax /* Restore RDI. */ popq %rdi - swapgs - jmp .Lnative_iret - + jmp .Lswapgs_and_iret +#endif SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY @@ -712,6 +723,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -957,14 +970,14 @@ SYM_CODE_START_LOCAL(paranoid_exit) IBRS_EXIT save_reg=%r15 /* - * The order of operations is important. RESTORE_CR3 requires + * The order of operations is important. PARANOID_RESTORE_CR3 requires * kernel GSBASE. * * NB to anyone to try to optimize this code: this code does * not execute at all for exceptions from user mode. Those * exceptions go through error_return instead. */ - RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14 /* Handle the three GSBASE cases */ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE @@ -1085,7 +1098,7 @@ SYM_CODE_END(error_return) * * Registers: * %r14: Used to save/restore the CR3 of the interrupted context - * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber. */ SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_ENTRY @@ -1393,8 +1406,7 @@ end_repeat_nmi: /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ IBRS_EXIT save_reg=%r15 - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 /* * The above invocation of paranoid_entry stored the GSBASE @@ -1439,6 +1451,12 @@ nmi_restore: movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. + */ + + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this * cannot result in a fault. 
Similarly, we don't need to worry @@ -1455,6 +1473,7 @@ SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl SYM_CODE_END(entry_SYSCALL32_ignore) @@ -1472,3 +1491,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) call make_task_dead SYM_CODE_END(rewind_stack_and_make_dead) .popsection + +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. + */ +SYM_FUNC_START(clear_bhb_loop) + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f + RET + .align 64, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b + RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +EXPORT_SYMBOL_GPL(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index de94e2e84ecc..ed0a5f2dc129 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -7,7 +7,6 @@ #include <asm/asm-offsets.h> #include <asm/current.h> #include <asm/errno.h> -#include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> #include <asm/irqflags.h> @@ -90,9 +89,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) cld - IBRS_ENTER - UNTRAIN_RET - /* * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether @@ -116,6 +112,16 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: + /* + * CPU bugs mitigations mechanisms can call other functions. They + * should be invoked after making sure TF is cleared because + * single-step is ignored only for instructions inside the + * entry_SYSENTER_compat function. + */ + IBRS_ENTER + UNTRAIN_RET + CLEAR_BRANCH_HISTORY + movq %rsp, %rdi call do_SYSENTER_32 jmp sysret32_from_system_call @@ -206,6 +212,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) IBRS_ENTER UNTRAIN_RET + CLEAR_BRANCH_HISTORY movq %rsp, %rdi call do_fast_syscall_32 @@ -270,8 +277,23 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) xorl %r9d, %r9d xorl %r10d, %r10d swapgs + CLEAR_CPU_BUFFERS sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR int3 SYM_CODE_END(entry_SYSCALL_compat) + +/* + * int 0x80 is used by 32 bit mode as a system call entry. 
Normally idt entries + * point to C routines, however since this is a system call interface the branch + * history needs to be scrubbed to protect against BHI attacks, and that + * scrubbing needs to take place in assembly code prior to entering any C + * routines. + */ +SYM_CODE_START(int80_emulation) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + CLEAR_BRANCH_HISTORY + jmp do_int80_emulation +SYM_CODE_END(int80_emulation) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 000000000000..a02bc6f3d2e6 --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include <linux/export.h> + +#include <asm/asm.h> +#include <asm/fred.h> +#include <asm/segment.h> + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. + */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) + FRED_EXIT +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. + */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + mov $__KERNEL_CS, %rax + push %rax + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. 
+ */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0 + movq %rsp, %rdi /* %rdi -> pt_regs */ + call __fred_entry_from_kvm /* Call the C entry point */ + POP_REGS + ERETS +1: + /* + * Objtool doesn't understand what ERETS does, this hint tells it that + * yes, we'll reach here and with what stack state. A save/restore pair + * isn't strictly needed, but it's the simplest form. + */ + UNWIND_HINT_RESTORE + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 000000000000..f004a4dc74c2 --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. + */ +#include <linux/kernel.h> +#include <linux/kdebug.h> +#include <linux/nospec.h> + +#include <asm/desc.h> +#include <asm/fred.h> +#include <asm/idtentry.h> +#include <asm/syscall.h> +#include <asm/trapnr.h> +#include <asm/traps.h> + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, error_code, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, error_code); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, error_code, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, error_code)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return fred_int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] 
__ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + + SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification), +}; + +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. 
That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cfc9bc73e7f..8cc9950d7104 100644 --- 
a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -14,12 +14,31 @@ #endif #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); - +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *); #include <asm/syscalls_32.h> -#undef __SYSCALL +#undef __SYSCALL -#define __SYSCALL(nr, sym) __ia32_##sym, +#undef __SYSCALL_NORETURN +#define __SYSCALL_NORETURN __SYSCALL -__visible const sys_call_ptr_t ia32_sys_call_table[] = { +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. + */ +#ifdef CONFIG_X86_32 +#define __SYSCALL(nr, sym) __ia32_##sym, +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL +#endif + +#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); +long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_32.h> + default: return __ia32_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index be120eec1fc9..ba8354424860 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -8,11 +8,29 @@ #include <asm/syscall.h> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); #include <asm/syscalls_64.h> -#undef __SYSCALL +#undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#undef __SYSCALL_NORETURN +#define __SYSCALL_NORETURN __SYSCALL -asmlinkage const sys_call_ptr_t sys_call_table[] = { +/* + * The sys_call_table[] is no longer used for system calls, but + * kernel/trace/trace_syscalls.c still wants to know the system + * call address. 
+ */ +#define __SYSCALL(nr, sym) __x64_##sym, +const sys_call_ptr_t sys_call_table[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL + +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); +long x64_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_64.h> + default: return __x64_sys_ni_syscall(regs); + } +}; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index bdd0e03a1265..fb77908f44f3 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,11 +8,18 @@ #include <asm/syscall.h> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); #include <asm/syscalls_x32.h> -#undef __SYSCALL +#undef __SYSCALL -#define __SYSCALL(nr, sym) __x64_##sym, +#undef __SYSCALL_NORETURN +#define __SYSCALL_NORETURN __SYSCALL -asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { -#include <asm/syscalls_x32.h> +#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_x32.h> + default: return __x64_sys_ni_syscall(regs); + } }; diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c8fac5205803..4d0fb2fba7e2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 32-bit system call numbers and entry vectors # # The format is: -# <number> <abi> <name> <entry point> <compat entry point> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] # # The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for # sys_*() system calls and compat_sys_*() compat system calls if @@ -12,7 +13,7 @@ # The abi is always "i386" for this file. 
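The syscall_*.c changes above replace the indirect sys_call_table[nr](regs) call with a direct switch, generated by redefining __SYSCALL() around an include of the generated header. A minimal self-contained sketch of that preprocessor pattern follows; the two-entry SYSCALL_LIST macro is only a stand-in for the generated asm/syscalls_64.h include, and the stub names are illustrative.

struct pt_regs;	/* opaque here; the real type comes from asm/ptrace.h */

/* Stand-in for <asm/syscalls_64.h>, which emits one __SYSCALL() per table entry. */
#define SYSCALL_LIST		\
	__SYSCALL(0, sys_read)	\
	__SYSCALL(1, sys_write)

/* Pass 1: emit prototypes for the per-ABI stubs. */
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
SYSCALL_LIST
extern long __x64_sys_ni_syscall(const struct pt_regs *);
#undef __SYSCALL

/* Pass 2: emit one switch case per syscall, avoiding a Spectre-prone indirect call. */
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	SYSCALL_LIST
	default:
		return __x64_sys_ni_syscall(regs);
	}
}
#undef __SYSCALL

The __SYSCALL_NORETURN variant in the real files additionally lets stubs such as sys_exit() be declared __noreturn, matching the new "noreturn" column in the .tbl files below.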
# 0 i386 restart_syscall sys_restart_syscall -1 i386 exit sys_exit +1 i386 exit sys_exit - noreturn 2 i386 fork sys_fork 3 i386 read sys_read 4 i386 write sys_write @@ -263,7 +264,7 @@ 249 i386 io_cancel sys_io_cancel 250 i386 fadvise64 sys_ia32_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) -252 i386 exit_group sys_exit_group +252 i386 exit_group sys_exit_group - noreturn 253 i386 lookup_dcookie 254 i386 epoll_create sys_epoll_create 255 i386 epoll_ctl sys_epoll_ctl @@ -420,7 +421,7 @@ 412 i386 utimensat_time64 sys_utimensat 413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64 414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64 -416 i386 io_pgetevents_time64 sys_io_pgetevents +416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64 417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64 418 i386 mq_timedsend_time64 sys_mq_timedsend 419 i386 mq_timedreceive_time64 sys_mq_timedreceive @@ -461,3 +462,13 @@ 454 i386 futex_wake sys_futex_wake 455 i386 futex_wait sys_futex_wait 456 i386 futex_requeue sys_futex_requeue +457 i386 statmount sys_statmount +458 i386 listmount sys_listmount +459 i386 lsm_get_self_attr sys_lsm_get_self_attr +460 i386 lsm_set_self_attr sys_lsm_set_self_attr +461 i386 lsm_list_modules sys_lsm_list_modules +462 i386 mseal sys_mseal +463 i386 setxattrat sys_setxattrat +464 i386 getxattrat sys_getxattrat +465 i386 listxattrat sys_listxattrat +466 i386 removexattrat sys_removexattrat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 8cb8bf68721c..5eb708bff1c7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # # 64-bit system call numbers and entry vectors # # The format is: -# <number> <abi> <name> <entry point> +# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]] # # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls # @@ -68,7 +69,7 @@ 57 common fork sys_fork 58 common vfork sys_vfork 59 64 execve sys_execve -60 common exit sys_exit +60 common exit sys_exit - noreturn 61 common wait4 sys_wait4 62 common kill sys_kill 63 common uname sys_newuname @@ -239,7 +240,7 @@ 228 common clock_gettime sys_clock_gettime 229 common clock_getres sys_clock_getres 230 common clock_nanosleep sys_clock_nanosleep -231 common exit_group sys_exit_group +231 common exit_group sys_exit_group - noreturn 232 common epoll_wait sys_epoll_wait 233 common epoll_ctl sys_epoll_ctl 234 common tgkill sys_tgkill @@ -343,6 +344,7 @@ 332 common statx sys_statx 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq +335 common uretprobe sys_uretprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal @@ -374,10 +376,20 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 -453 64 map_shadow_stack sys_map_shadow_stack +453 common map_shadow_stack sys_map_shadow_stack 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal +463 common 
setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S index 416b400f39db..119ebdc3d362 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk.S @@ -9,39 +9,6 @@ #include "calling.h" #include <asm/asm.h> - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func -SYM_FUNC_START(\name) - pushq %rbp - movq %rsp, %rbp - - pushq %rdi - pushq %rsi - pushq %rdx - pushq %rcx - pushq %rax - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - - call \func - - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - popq %rbp - RET -SYM_FUNC_END(\name) - _ASM_NOKPROBE(\name) - .endm - THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace EXPORT_SYMBOL(preempt_schedule_thunk) diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S deleted file mode 100644 index 0103e103a657..000000000000 --- a/arch/x86/entry/thunk_32.S +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) - * Copyright 2008 by Steven Rostedt, Red Hat, Inc - * (inspired by Andi Kleen's thunk_64.S) - */ - #include <linux/export.h> - #include <linux/linkage.h> - #include <asm/asm.h> - - /* put return address in eax (arg1) */ - .macro THUNK name, func, put_ret_addr_in_eax=0 -SYM_CODE_START_NOALIGN(\name) - pushl %eax - pushl %ecx - pushl %edx - - .if \put_ret_addr_in_eax - /* Place EIP in the arg1 */ - movl 3*4(%esp), %eax - .endif - - call \func - popl %edx - popl %ecx - popl %eax - RET - _ASM_NOKPROBE(\name) -SYM_CODE_END(\name) - .endm - - THUNK preempt_schedule_thunk, preempt_schedule - THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace - EXPORT_SYMBOL(preempt_schedule_thunk) - EXPORT_SYMBOL(preempt_schedule_notrace_thunk) - diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index b1b8dd1608f7..c9216ac4fb1e 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,63 +3,32 @@ # Building vDSO images for x86. # -# Include the generic Makefile to check the built vdso. +# Include the generic Makefile to check the built vDSO: include $(srctree)/lib/vdso/Makefile -# Sanitizer runtimes are unavailable and cannot be linked here. -KASAN_SANITIZE := n -KMSAN_SANITIZE_vclock_gettime.o := n -KMSAN_SANITIZE_vgetcpu.o := n - -UBSAN_SANITIZE := n -KCSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y - -# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. 
-KCOV_INSTRUMENT := n - -VDSO64-$(CONFIG_X86_64) := y -VDSOX32-$(CONFIG_X86_X32_ABI) := y -VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_IA32_EMULATION) := y - -# files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +# Files to link into the vDSO: +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o vobjs-$(CONFIG_X86_SGX) += vsgx.o -# files to link into kernel -obj-y += vma.o extable.o -KASAN_SANITIZE_vma.o := y -UBSAN_SANITIZE_vma.o := y -KCSAN_SANITIZE_vma.o := y -OBJECT_FILES_NON_STANDARD_vma.o := n -OBJECT_FILES_NON_STANDARD_extable.o := n - -# vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32 +# Files to link into the kernel: +obj-y += vma.o extable.o -obj-$(VDSO32-y) += vdso32-setup.o -OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n +# vDSO images to build: +obj-$(CONFIG_X86_64) += vdso-image-64.o +obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o +obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) -vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) +vobjs := $(addprefix $(obj)/, $(vobjs-y)) +vobjs32 := $(addprefix $(obj)/, $(vobjs32-y)) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) targets += vdso32/vdso32.lds $(vobjs32-y) -# Build the vDSO image C files and link them in. -vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) -obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) +targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C @@ -87,7 +56,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) CFL += $(RETPOLINE_VDSO_CFLAGS) endif @@ -104,6 +73,7 @@ CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg CFLAGS_REMOVE_vsgx.o = -pg +CFLAGS_REMOVE_vgetrandom.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. @@ -123,7 +93,7 @@ VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ vobjx32s-y := $(vobjs-y:.o=-x32.o) # same thing, but in the output directory -vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) +vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y)) # Convert 64bit object file to x32 for x32 vDSO. 
quiet_cmd_x32 = X32 $@ @@ -164,7 +134,7 @@ KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) endif @@ -182,13 +152,10 @@ quiet_cmd_vdso = VDSO $@ cmd_vdso = $(LD) -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ -T $(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + sh $(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack -GCOV_PROFILE := n quiet_cmd_vdso_and_check = VDSO $@ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check) - -clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 7d70935b6758..0debc194bd78 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -11,12 +11,10 @@ #include <linux/time.h> #include <linux/kernel.h> #include <linux/types.h> +#include <vdso/gettime.h> #include "../../../../lib/vdso/gettimeofday.c" -extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); -extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t); - int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); @@ -35,9 +33,6 @@ __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__v #if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64) /* both 64-bit and x32 use these */ -extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); -extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res); - int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { return __cvdso_clock_gettime(clock, ts); @@ -56,9 +51,6 @@ int clock_getres(clockid_t, struct __kernel_timespec *) #else /* i386 only */ -extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts); -extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res); - int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts) { return __cvdso_clock_gettime32(clock, ts); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index bafa73f09e92..872947c1004c 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/vdso.h> +#include <asm/vdso/vsyscall.h> /* * Linker script for vDSO. This is an ELF shared object prelinked to @@ -16,23 +17,16 @@ SECTIONS * segment. */ - vvar_start = . - 4 * PAGE_SIZE; + vvar_start = . - __VVAR_PAGES * PAGE_SIZE; vvar_page = vvar_start; - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#include <asm/vvar.h> -#undef EMIT_VVAR + vdso_rng_data = vvar_page + __VDSO_RND_DATA_OFFSET; - pvclock_page = vvar_start + PAGE_SIZE; - hvclock_page = vvar_start + 2 * PAGE_SIZE; - timens_page = vvar_start + 3 * PAGE_SIZE; + timens_page = vvar_start + PAGE_SIZE; -#undef _ASM_X86_VVAR_H - /* Place all vvars in timens too at the offsets in asm/vvar.h. 
*/ -#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset; -#include <asm/vvar.h> -#undef EMIT_VVAR + vclock_pages = vvar_start + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE; + pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE; + hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index e8c60ae7a7c8..0bab5f4af6d1 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -30,6 +30,8 @@ VERSION { #ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; #endif + getrandom; + __vdso_getrandom; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 76e4e74f35b5..f6d2d8aba643 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -57,7 +57,7 @@ __setup_param("vdso=", vdso_setup, vdso32_setup, 0); /* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> -static struct ctl_table abi_table2[] = { +static const struct ctl_table abi_table2[] = { { .procname = "vsyscall32", .data = &vdso32_enabled, diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..bcba5639b8ee --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +.section .rodata, "a" +.align 16 +CONSTANTS: .octa 0x6b20657479622d323320646e61707865 +.text + +/* + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number + * of blocks of output with a nonce of 0, taking an input key and 8-byte + * counter. Importantly does not spill to the stack. Its arguments are: + * + * rdi: output bytes + * rsi: 32-byte key input + * rdx: 8-byte counter input/output + * rcx: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +.set output, %rdi +.set key, %rsi +.set counter, %rdx +.set nblocks, %rcx +.set i, %al +/* xmm registers are *not* callee-save. 
*/ +.set temp, %xmm0 +.set state0, %xmm1 +.set state1, %xmm2 +.set state2, %xmm3 +.set state3, %xmm4 +.set copy0, %xmm5 +.set copy1, %xmm6 +.set copy2, %xmm7 +.set copy3, %xmm8 +.set one, %xmm9 + + /* copy0 = "expand 32-byte k" */ + movaps CONSTANTS(%rip),copy0 + /* copy1,copy2 = key */ + movups 0x00(key),copy1 + movups 0x10(key),copy2 + /* copy3 = counter || zero nonce */ + movq 0x00(counter),copy3 + /* one = 1 || 0 */ + movq $1,%rax + movq %rax,one + +.Lblock: + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ + movdqa copy0,state0 + movdqa copy1,state1 + movdqa copy2,state2 + movdqa copy3,state3 + + movb $10,i +.Lpermute: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + pshufd $0x39,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[3,0,1,2] */ + pshufd $0x93,state3,state3 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + pshufd $0x93,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + pshufd $0x39,state3,state3 + + decb i + jnz .Lpermute + + /* output0 = state0 + copy0 */ + paddd copy0,state0 + movups state0,0x00(output) + /* output1 = state1 + copy1 */ + paddd copy1,state1 + movups state1,0x10(output) + /* output2 = state2 + copy2 */ + paddd copy2,state2 + movups state2,0x20(output) + /* output3 = state3 + copy3 */ + paddd copy3,state3 + movups state3,0x30(output) + + /* ++copy3.counter */ + paddq one,copy3 + + /* output += 64, --nblocks */ + addq $64,output + decq nblocks + jnz .Lblock + + /* counter = copy3.counter */ + movq copy3,0x00(counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these again. 
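As a readability aid for the SSE2 rounds above, here is a plain C sketch of the ChaCha quarter round and double round that the vector code computes four lanes at a time: the pslld/psrld/por triples correspond to rotl32() below, and the pshufd shuffles realign the columns so the same code performs the diagonal rounds. This is an illustrative reference, not kernel code.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int c)
{
	return (v << c) | (v >> (32 - c));
}

/* One ChaCha quarter round on four state words. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rotl32(*d, 16);
	*c += *d; *b ^= *c; *b = rotl32(*b, 12);
	*a += *b; *d ^= *a; *d = rotl32(*d, 8);
	*c += *d; *b ^= *c; *b = rotl32(*b, 7);
}

/* One double round: 4 column rounds then 4 diagonal rounds (10 of these per block). */
static void double_round(uint32_t x[16])
{
	quarter_round(&x[0], &x[4], &x[ 8], &x[12]);
	quarter_round(&x[1], &x[5], &x[ 9], &x[13]);
	quarter_round(&x[2], &x[6], &x[10], &x[14]);
	quarter_round(&x[3], &x[7], &x[11], &x[15]);
	quarter_round(&x[0], &x[5], &x[10], &x[15]);
	quarter_round(&x[1], &x[6], &x[11], &x[12]);
	quarter_round(&x[2], &x[7], &x[ 8], &x[13]);
	quarter_round(&x[3], &x[4], &x[ 9], &x[14]);
}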
*/ + pxor state0,state0 + pxor state1,state1 + pxor state2,state2 + pxor state3,state3 + pxor copy1,copy1 + pxor copy2,copy2 + pxor temp,temp + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c new file mode 100644 index 000000000000..430862b8977c --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ +#include <linux/types.h> + +#include "../../../../lib/vdso/getrandom.c" + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} + +ssize_t getrandom(void *, size_t, unsigned int, void *, size_t) + __attribute__((weak, alias("__vdso_getrandom"))); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7645730dc228..39e6efc1a9ca 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,23 +20,20 @@ #include <asm/vgtod.h> #include <asm/proto.h> #include <asm/vdso.h> -#include <asm/vvar.h> #include <asm/tlb.h> #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> +#include <asm/vdso/vsyscall.h> #include <clocksource/hyperv_timer.h> -#undef _ASM_X86_VVAR_H -#define EMIT_VVAR(name, offset) \ - const size_t name ## _offset = offset; -#include <asm/vvar.h> - struct vdso_data *arch_get_vdso_data(void *vvar_page) { - return (struct vdso_data *)(vvar_page + _vdso_data_offset); + return (struct vdso_data *)vvar_page; } -#undef EMIT_VVAR + +static union vdso_data_store vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = vdso_data_store.data; unsigned int vclocks_used __read_mostly; @@ -51,7 +48,8 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len)); + image->alt_len), + NULL); return 0; } @@ -151,7 +149,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, if (sym_offset == image->sym_vvar_page) { struct page *timens_page = find_timens_vvar_page(vma); - pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; /* * If a task belongs to a time namespace then a namespace @@ -179,32 +177,52 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, } return vmf_insert_pfn(vma, vmf->address, pfn); - } else if (sym_offset == image->sym_pvclock_page) { - struct pvclock_vsyscall_time_info *pvti = - pvclock_get_pvti_cpu0_va(); - if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) { - return vmf_insert_pfn_prot(vma, vmf->address, - __pa(pvti) >> PAGE_SHIFT, - pgprot_decrypted(vma->vm_page_prot)); - } - } else if (sym_offset == image->sym_hvclock_page) { - pfn = hv_get_tsc_pfn(); - if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) - return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_timens_page) { struct page *timens_page = find_timens_vvar_page(vma); if (!timens_page) return VM_FAULT_SIGBUS; - pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; + pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; return vmf_insert_pfn(vma, vmf->address, pfn); } return VM_FAULT_SIGBUS; } +static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + switch (vmf->pgoff) { +#ifdef CONFIG_PARAVIRT_CLOCK + 
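/* vmf->pgoff is the page index within the [vvar_vclock] VMA, so it is compared against the same VDSO_PAGE_*CLOCK_OFFSET values used to place the pvclock/hvclock pages in vdso-layout.lds.S above. */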
case VDSO_PAGE_PVCLOCK_OFFSET: + { + struct pvclock_vsyscall_time_info *pvti = + pvclock_get_pvti_cpu0_va(); + + if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) + return vmf_insert_pfn_prot(vma, vmf->address, + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); + break; + } +#endif /* CONFIG_PARAVIRT_CLOCK */ +#ifdef CONFIG_HYPERV_TIMER + case VDSO_PAGE_HVCLOCK_OFFSET: + { + unsigned long pfn = hv_get_tsc_pfn(); + + if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) + return vmf_insert_pfn(vma, vmf->address, pfn); + break; + } +#endif /* CONFIG_HYPERV_TIMER */ + } + + return VM_FAULT_SIGBUS; +} + static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, @@ -214,6 +232,10 @@ static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, }; +static const struct vm_special_mapping vvar_vclock_mapping = { + .name = "[vvar_vclock]", + .fault = vvar_vclock_fault, +}; /* * Add vdso and vvar mappings to current process. @@ -256,7 +278,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) vma = _install_special_mapping(mm, addr, - -image->sym_vvar_start, + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| VM_PFNMAP, &vvar_mapping); @@ -264,68 +286,30 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); - } else { - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; + goto up_fail; } -up_fail: - mmap_write_unlock(mm); - return ret; -} - -#ifdef CONFIG_X86_64 -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ - unsigned long addr, end; - unsigned offset; + vma = _install_special_mapping(mm, + addr + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, + &vvar_vclock_mapping); - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= DEFAULT_MAP_WINDOW) - end = DEFAULT_MAP_WINDOW; - end -= len; - - if (end > start) { - offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + do_munmap(mm, text_start, image->size, NULL); + do_munmap(mm, addr, image->size, NULL); + goto up_fail; } - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. 
- */ - addr = align_vdso_addr(addr); - - return addr; -} + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; -static int map_vdso_randomized(const struct vdso_image *image) -{ - unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); - - return map_vdso(image, addr); +up_fail: + mmap_write_unlock(mm); + return ret; } -#endif int map_vdso_once(const struct vdso_image *image, unsigned long addr) { @@ -343,7 +327,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) */ for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || - vma_is_special_mapping(vma, &vvar_mapping)) { + vma_is_special_mapping(vma, &vvar_mapping) || + vma_is_special_mapping(vma, &vvar_vclock_mapping)) { mmap_write_unlock(mm); return -EEXIST; } @@ -369,7 +354,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_64); + return map_vdso(&vdso_image_64, 0); } #ifdef CONFIG_COMPAT @@ -380,7 +365,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, if (x32) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(&vdso_image_x32); + return map_vdso(&vdso_image_x32, 0); } #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index e0ca8120aea8..2fb7d53cf333 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -76,7 +76,7 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); @@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) static bool write_ok_or_segv(unsigned long ptr, size_t size) { - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_err, this could go away. - */ - if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; @@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { - struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_err; long ret; unsigned long orig_dx; @@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code, goto sigsegv; } - tsk = current; - /* * Check for access_ok violations and find the syscall nr. * @@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code, goto do_ret; /* skip requested */ /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. + * With a real vsyscall, page faults cause SIGSEGV. */ - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; - current->thread.sig_on_uaccess_err = 1; - ret = -EFAULT; switch (vsyscall_nr) { case 0: @@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code, break; } - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; - check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. 
*/ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ + goto sigsegv; } regs->ax = ret;
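The hunks above wire getrandom() into the x86-64 vDSO: vdso.lds.S exports getrandom/__vdso_getrandom, vgetrandom-chacha.S provides the stack-free ChaCha20 block function, and vgetrandom.c forwards into the generic __cvdso_getrandom(). The following is a minimal, illustrative userspace sketch of driving that entry point; it is not part of the patch. fill_random() and the vgetrandom_fn pointer are made-up names, the caller is assumed to have already resolved "__vdso_getrandom" from the vDSO (for example by walking the ELF image at getauxval(AT_SYSINFO_EHDR)), and the opaque-state handshake (opaque_len == ~0UL filling struct vgetrandom_opaque_params) comes from the generic lib/vdso/getrandom.c side of this series rather than from the files shown here, so the struct layout below should be checked against include/uapi/linux/random.h.

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/mman.h>

/* Layout assumed from the companion generic vDSO getrandom code. */
struct vgetrandom_opaque_params {
	uint32_t size_of_opaque_state;
	uint32_t mmap_prot;
	uint32_t mmap_flags;
	uint32_t reserved[13];
};

/* Matches the __vdso_getrandom() signature in vgetrandom.c above. */
typedef ssize_t (*vgetrandom_fn)(void *buf, size_t len, unsigned int flags,
				 void *opaque_state, size_t opaque_len);

/* 'vgetrandom' must already point at the resolved vDSO symbol. */
ssize_t fill_random(vgetrandom_fn vgetrandom, void *buf, size_t len)
{
	static __thread struct vgetrandom_opaque_params params;
	static __thread void *state;

	/* First call on this thread: ask the vDSO how to allocate a state. */
	if (!params.size_of_opaque_state &&
	    vgetrandom(NULL, 0, 0, &params, ~0UL) != 0)
		return -1;

	if (!state) {
		state = mmap(NULL, params.size_of_opaque_state, params.mmap_prot,
			     params.mmap_flags, -1, 0);
		if (state == MAP_FAILED) {
			state = NULL;
			return -1;
		}
	}

	/* The vDSO falls back to the getrandom() syscall internally when it must. */
	return vgetrandom(buf, len, 0, state, params.size_of_opaque_state);
}

Each concurrent caller needs its own opaque state (hence the per-thread allocation here); that state is what lets the vDSO hand out ChaCha20 output without a syscall on the hot path.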