diff options
Diffstat (limited to 'arch/x86/kernel/traps.c')
| -rw-r--r-- | arch/x86/kernel/traps.c | 393 |
1 files changed, 321 insertions, 72 deletions
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4fa0b17e5043..bcf1dedc1d00 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/static_call.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> @@ -42,6 +43,7 @@ #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> +#include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> @@ -67,6 +69,7 @@ #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -91,6 +94,187 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, further decode to determine its use: + * + * FineIBT: d6 udb + * FineIBT: f0 75 f9 lock jne . - 6 + * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax + * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax + * static_call: 0f b9 cc ud1 %esp,%ecx + * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg + * + * Notable, since __WARN_trap can use all registers, the distinction between + * UD1 users is through R/M. + */ +__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) +{ + unsigned long start = addr; + u8 v, reg, rm, rex = 0; + int type = BUG_UD1; + bool lock = false; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + for (;;) { + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + continue; + + if (v == INSN_LOCK) { + lock = true; + continue; + } + + if ((v & 0xf0) == 0x40) { + rex = v; + continue; + } + + break; + } + + switch (v) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + addr += 1; /* d8 */ + *len = addr - start; + WARN_ON_ONCE(!lock); + return BUG_LOCK; + + case 0xd6: + *len = addr - start; + return BUG_UDB; + + case OPCODE_ESCAPE: + break; + + default: + return BUG_NONE; + } + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) { + *len = addr - start; + return BUG_UD2; + } + + if (v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + *imm = 0; + v = *(u8 *)(addr++); /* ModRM */ + + if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) + addr++; /* SIB */ + + reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex); + rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex); + + /* Decode immediate, if present */ + switch (X86_MODRM_MOD(v)) { + case 0: if (X86_MODRM_RM(v) == 5) + addr += 4; /* RIP + disp32 */ + + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; + + if (rm == 2) { /* (%edx) */ + *imm = reg; + type = BUG_UD1_WARN; + } + break; + + case 1: *imm = *(s8 *)addr; + addr += 1; + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; + break; + + case 2: *imm = *(s32 *)addr; + addr += 4; + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; + break; + + case 3: break; + } + + /* record instruction length */ + *len = addr - start; + + return type; +} + +static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr) +{ + int offset = pt_regs_offset(regs, nr); + if (WARN_ON_ONCE(offset < -0)) + return 0; + return *((unsigned long *)((void *)regs + offset)); +} + +#ifdef HAVE_ARCH_BUG_FORMAT_ARGS +DEFINE_STATIC_CALL(WARN_trap, __WARN_trap); +EXPORT_STATIC_CALL_TRAMP(WARN_trap); + +/* + * Create a va_list from an exception context. + */ +void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) +{ + /* + * Register save area; populate with function call argument registers + */ + args->regs[0] = regs->di; + args->regs[1] = regs->si; + args->regs[2] = regs->dx; + args->regs[3] = regs->cx; + args->regs[4] = regs->r8; + args->regs[5] = regs->r9; + + /* + * From the ABI document: + * + * @gp_offset - the element holds the offset in bytes from + * reg_save_area to the place where the next available general purpose + * argument register is saved. In case all argument registers have + * been exhausted, it is set to the value 48 (6*8). + * + * @fp_offset - the element holds the offset in bytes from + * reg_save_area to the place where the next available floating point + * argument is saved. In case all argument registers have been + * exhausted, it is set to the value 176 (6*8 + 8*16) + * + * @overflow_arg_area - this pointer is used to fetch arguments passed + * on the stack. It is initialized with the address of the first + * argument passed on the stack, if any, and then always updated to + * point to the start of the next argument on the stack. + * + * @reg_save_area - the element points to the start of the register + * save area. + * + * Notably the vararg starts with the second argument and there are no + * floating point arguments in the kernel. + */ + args->args.gp_offset = 1*8; + args->args.fp_offset = 6*8 + 8*16; + args->args.reg_save_area = &args->regs; + args->args.overflow_arg_area = (void *)regs->sp; + + /* + * If the exception came from __WARN_trap, there is a return + * address on the stack, skip that. This is why any __WARN_trap() + * caller must inhibit tail-call optimization. + */ + if ((void *)regs->ip == &__WARN_trap) + args->args.overflow_arg_area += 8; + + return &args->args; +} +#endif /* HAVE_ARCH_BUG_FORMAT */ + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -215,15 +399,13 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { + unsigned long addr = regs->ip; bool handled = false; + int ud_type, ud_len; + s32 ud_imm; - /* - * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() - * is a rare case that uses @regs without passing them to - * irqentry_enter(). - */ - kmsan_unpoison_entry_regs(regs); - if (!is_valid_bugaddr(regs->ip)) + ud_type = decode_bug(addr, &ud_imm, &ud_len); + if (ud_type == BUG_NONE) return handled; /* @@ -231,16 +413,63 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; - handled = true; + + switch (ud_type) { + case BUG_UD1_WARN: + if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN) + handled = true; + break; + + case BUG_UD2: + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + fallthrough; + + case BUG_UDB: + case BUG_LOCK: + if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + break; + + case BUG_UD1_UBSAN: + if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", + report_ubsan_failure(ud_imm), + (void *)regs->ip); + } + break; + + default: + break; + } + + /* + * When continuing, and regs->ip hasn't changed, move it to the next + * instruction. When not continuing execution, restore the instruction + * pointer. + */ + if (handled) { + if (regs->ip == addr) + regs->ip += ud_len; + } else { + regs->ip = addr; } + if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); @@ -331,6 +560,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, #endif /* + * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 + * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch + * between configs triggers objtool warnings. + * + * This is a temporary hack until we have compiler or plugin support for + * annotating noreturns. + */ +#ifdef CONFIG_X86_ESPFIX64 +#define always_true() true +#else +bool always_true(void); +bool __weak always_true(void) { return true; } +#endif + +/* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the @@ -465,7 +709,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault) pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); - panic("Machine halted."); + if (always_true()) + panic("Machine halted."); instrumentation_end(); } @@ -487,13 +732,23 @@ DEFINE_IDTENTRY(exc_bounds) enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, - GP_CANONICAL + GP_CANONICAL, + GP_LASS_VIOLATION, + GP_NULL_POINTER, +}; + +static const char * const kernel_gp_hint_help[] = { + [GP_NON_CANONICAL] = "probably for non-canonical address", + [GP_CANONICAL] = "maybe for address", + [GP_LASS_VIOLATION] = "probably LASS violation for address", + [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure - * out whether any part of the access to that address was non-canonical. + * out whether any part of the access to that address was non-canonical or + * across privilege levels. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) @@ -515,14 +770,28 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, return GP_NO_HINT; #ifdef CONFIG_X86_64 + /* Operand is in the kernel half */ + if (*addr >= ~__VIRTUAL_MASK) + return GP_CANONICAL; + + /* The last byte of the operand is not in the user canonical half */ + if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) + return GP_NON_CANONICAL; + /* - * Check that: - * - the operand is not in the kernel half - * - the last byte of the operand is not in the user canonical half + * A NULL pointer dereference usually causes a #PF. However, it + * can result in a #GP when LASS is active. Provide the same + * hint in the rare case that the condition is hit without LASS. */ - if (*addr < ~__VIRTUAL_MASK && - *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) - return GP_NON_CANONICAL; + if (*addr < PAGE_SIZE) + return GP_NULL_POINTER; + + /* + * Assume that LASS caused the exception, because the address is + * canonical and in the user half. + */ + if (cpu_feature_enabled(X86_FEATURE_LASS)) + return GP_LASS_VIOLATION; #endif return GP_CANONICAL; @@ -602,7 +871,7 @@ static bool try_fixup_enqcmd_gp(void) if (current->pasid_activated) return false; - wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; @@ -685,9 +954,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", - (hint == GP_NON_CANONICAL) ? "probably for non-canonical address" - : "maybe for address", - gp_addr); + kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it @@ -735,16 +1002,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. @@ -874,24 +1141,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline unsigned long debug_read_clear_dr6(void) +static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; + get_debugreg(dr6, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + /* * The Intel SDM says: * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Certain debug exceptions may clear bits 0-3 of DR6. + * + * BLD induced #DB clears DR6.BLD and any other debug + * exception doesn't modify DR6.BLD. * - * Keep it simple: clear DR6 immediately. + * RTM induced #DB clears DR6.RTM and any other debug + * exception sets DR6.RTM. + * + * To avoid confusion in identifying debug exceptions, + * debug handlers should set DR6.BLD and DR6.RTM, and + * clear other DR6 bits before returning. + * + * Keep it simple: write DR6 with its architectural reset + * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ - get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); - dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } @@ -973,9 +1248,9 @@ static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) */ unsigned long debugctl; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* @@ -1091,13 +1366,13 @@ out: /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - exc_debug_kernel(regs, debug_read_clear_dr6()); + exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - exc_debug_user(regs, debug_read_clear_dr6()); + exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED @@ -1116,7 +1391,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which - * debug_read_clear_dr6() returns for the IDT entry points. + * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); @@ -1131,7 +1406,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6 = debug_read_clear_dr6(); + unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); @@ -1148,7 +1423,7 @@ DEFINE_IDTENTRY_RAW(exc_debug) static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; - struct fpu *fpu = &task->thread.fpu; + struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -1239,11 +1514,11 @@ static bool handle_xfd_event(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; - rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) @@ -1402,34 +1677,8 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif -/* Do not enable FRED by default yet. */ -static bool enable_fred __ro_after_init = false; - -#ifdef CONFIG_X86_FRED -static int __init fred_setup(char *str) -{ - if (!str) - return -EINVAL; - - if (!cpu_feature_enabled(X86_FEATURE_FRED)) - return 0; - - if (!strcmp(str, "on")) - enable_fred = true; - else if (!strcmp(str, "off")) - enable_fred = false; - else - pr_warn("invalid FRED option: 'fred=%s'\n", str); - return 0; -} -early_param("fred", fred_setup); -#endif - void __init trap_init(void) { - if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) - setup_clear_cpu_cap(X86_FEATURE_FRED); - /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); @@ -1437,7 +1686,7 @@ void __init trap_init(void) sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ - cpu_init_exception_handling(); + cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) |
