Diffstat (limited to 'arch/x86/kernel/cpu/mce/core.c')
-rw-r--r--   arch/x86/kernel/cpu/mce/core.c | 1059
1 file changed, 591 insertions, 468 deletions
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2c8ec5c71712..34440021e8cf 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -44,14 +44,18 @@ #include <linux/sync_core.h> #include <linux/task_work.h> #include <linux/hardirq.h> +#include <linux/kexec.h> +#include <linux/vmcore_info.h> -#include <asm/intel-family.h> +#include <asm/fred.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> +#include <asm/tdx.h> #include "internal.h" @@ -67,13 +71,7 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); -struct mce_bank { - u64 ctl; /* subevents to enable */ - - __u64 init : 1, /* initialise bank? */ - __reserved_1 : 63; -}; -static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); +DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); #define ATTR_LEN 16 /* One object for each MCE bank, shared by all CPUs */ @@ -91,7 +89,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -120,28 +118,41 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).phys_proc_id; - m->apicid = cpu_data(m->extcpu).initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = topology_physical_package_id(cpu); +} + +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -162,8 +173,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? 
" Exception" : ""), @@ -190,6 +203,10 @@ static void __print_mce(struct mce *m) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -205,9 +222,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -234,11 +253,20 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static const char *mce_dump_aux_info(struct mce *m) +{ + if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) + return tdx_dump_mce_info(m); + + return NULL; +} + +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + const char *memmsg; /* * Allow instrumentation around external facilities usage. Not that it @@ -264,20 +292,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -285,13 +315,33 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); + + memmsg = mce_dump_aux_info(&final->m); + if (memmsg) + pr_emerg(HW_ERR "Machine check: %s\n", memmsg); + if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; + + /* + * Kdump skips the poisoned page in order to avoid + * touching the error bits again. Poison the page even + * if the error is fatal and the machine is about to + * panic. 
+ */ + if (kexec_crash_loaded()) { + if (final && (final->m.status & MCI_STATUS_ADDRV)) { + struct page *p; + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } + } panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -339,9 +389,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrq(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset; @@ -374,7 +424,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -395,7 +445,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) low = (u32)v; high = (u32)(v >> 32); - /* See comment in mce_rdmsrl() */ + /* See comment in mce_rdmsrq() */ asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) @@ -407,17 +457,19 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(err); instrumentation_end(); - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + m = &err->m; + m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); if (regs) { /* * Get the address of the instruction at the time of @@ -437,14 +489,14 @@ static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) } /* Use accurate RIP reporting if available. */ if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); + m->ip = mce_rdmsrq(mca_cfg.rip_msr); } } -int mce_available(struct cpuinfo_x86 *c) +bool mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) - return 0; + return false; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -459,32 +511,22 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granularity for now. 
- */ -int mce_usable_address(struct mce *m) +bool mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) - return 0; - - /* Checks after this one are Intel/Zhaoxin-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; + return false; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + return amd_mce_usable_address(m); - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + return intel_mce_usable_address(m); - return 1; + default: + return true; + } } EXPORT_SYMBOL_GPL(mce_usable_address); @@ -543,16 +585,38 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +static bool mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return true; + } + + return false; +} + static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -579,7 +643,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, mce->severity != MCE_DEFERRED_SEVERITY) return NOTIFY_DONE; - pfn = mce->addr >> PAGE_SHIFT; + pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; @@ -596,13 +660,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -616,13 +680,18 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + if (m->kflags & MCE_CHECK_DFR_REGS) + m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i)); + else + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -633,28 +702,103 @@ static noinstr void mce_read_aux(struct mce *m, int i) m->addr <<= shift; } - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. 
- */ - if (mce_flags.smca) { - u8 lsb = (m->addr >> 56) & 0x3f; - - m->addr &= GENMASK_ULL(55, lsb); - } + smca_extract_err_addr(m); } if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + if (m->status & MCI_STATUS_SYNDV) { + m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static bool smca_should_log_poll_error(struct mce *m) +{ + if (m->status & MCI_STATUS_VAL) + return true; + + m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank)); + if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) { + m->kflags |= MCE_CHECK_DFR_REGS; + return true; + } + + return false; +} + +/* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + if (mce_flags.smca) + return smca_should_log_poll_error(m); + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + +/* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. * @@ -669,98 +813,63 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. 
*/ -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - bool error_seen = false; - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); - - /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. - */ - if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) - goto log_it; + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. + * Update storm tracking here, before checking for the + * MCI_STATUS_VAL bit. Valid corrected errors count + * towards declaring, or maintaining, storm status. No + * error in a bank counts towards avoiding, or ending, + * storm status. */ - continue; + if (!mca_cfg.cmci_disabled) + mce_track_storm(m); -log_it: - error_seen = true; - - if (flags & MCP_DONTLOG) - goto clear_it; + /* Verify that the error should be logged based on hardware conditions. */ + if (!should_log_poll_error(flags, &err)) + continue; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: - /* - * Clear state for this bank. 
- */ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -769,8 +878,6 @@ clear_it: */ sync_core(); - - return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -820,8 +927,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) */ static noinstr bool quirk_skylake_repmov(void) { - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE); u64 mc1_status; /* @@ -832,7 +939,7 @@ static noinstr bool quirk_skylake_repmov(void) !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) return false; - mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1)); /* Check for a software-recoverable data fetch error. */ if ((mc1_status & @@ -843,8 +950,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -857,17 +964,38 @@ static noinstr bool quirk_skylake_repmov(void) } /* + * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption + * errors. This means mce_gather_info() will not save the "ip" and "cs" registers. + * + * However, the context is still valid, so save the "cs" register for later use. + * + * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV. + * + * The Instruction Fetch Unit is at MCA bank 1 for all affected systems. + */ +static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 1) + return; + if (!(m->status & MCI_STATUS_POISON)) + return; + + m->cs = regs->cs; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -875,9 +1003,12 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo if (mce_flags.snb_ifu_quirk) quirk_sandybridge_ifu(i, m, regs); + if (mce_flags.zen_ifu_quirk) + quirk_zen_ifu(i, m, regs); + m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -968,10 +1099,11 @@ out: */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -979,11 +1111,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. 
*/ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -995,7 +1129,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1012,11 +1146,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. */ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1036,12 +1170,12 @@ static noinstr int mce_start(int *no_way_out) if (!timeout) return ret; - arch_atomic_add(*no_way_out, &global_nwo); + raw_atomic_add(*no_way_out, &global_nwo); /* * Rely on the implied barrier below, such that global_nwo * is updated before mce_callin. */ - order = arch_atomic_inc_return(&mce_callin); + order = raw_atomic_inc_return(&mce_callin); arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* Enable instrumentation around calls to external facilities */ @@ -1050,10 +1184,10 @@ static noinstr int mce_start(int *no_way_out) /* * Wait for everyone. */ - while (arch_atomic_read(&mce_callin) != num_online_cpus()) { + while (raw_atomic_read(&mce_callin) != num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { - arch_atomic_set(&global_nwo, 0); + raw_atomic_set(&global_nwo, 0); goto out; } ndelay(SPINUNIT); @@ -1068,7 +1202,7 @@ static noinstr int mce_start(int *no_way_out) /* * Monarch: Starts executing now, the others wait. */ - arch_atomic_set(&mce_executing, 1); + raw_atomic_set(&mce_executing, 1); } else { /* * Subject: Now start the scanning loop one by one in @@ -1076,10 +1210,10 @@ static noinstr int mce_start(int *no_way_out) * This way when there are any shared banks it will be * only seen by one CPU before cleared, avoiding duplicates. */ - while (arch_atomic_read(&mce_executing) < order) { + while (raw_atomic_read(&mce_executing) < order) { if (mce_timed_out(&timeout, "Timeout: Subject CPUs unable to finish machine check processing")) { - arch_atomic_set(&global_nwo, 0); + raw_atomic_set(&global_nwo, 0); goto out; } ndelay(SPINUNIT); @@ -1089,7 +1223,7 @@ static noinstr int mce_start(int *no_way_out) /* * Cache the global no_way_out state. 
*/ - *no_way_out = arch_atomic_read(&global_nwo); + *no_way_out = raw_atomic_read(&global_nwo); ret = order; @@ -1180,7 +1314,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1204,7 +1338,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1212,7 +1346,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + native_wrmsrq(MSR_IA32_MCG_STATUS, 0); return true; } } @@ -1220,13 +1354,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1240,7 +1375,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1271,7 +1406,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1281,17 +1416,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. 
*/ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1308,6 +1443,7 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + unsigned long pfn; int ret; p->mce_count = 0; @@ -1316,9 +1452,10 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + ret = memory_failure(pfn, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + set_mce_nospec(pfn); sync_core(); return; } @@ -1340,16 +1477,19 @@ static void kill_me_maybe(struct callback_head *cb) static void kill_me_never(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + unsigned long pfn; p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1362,11 +1502,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1415,8 +1556,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1454,13 +1597,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1469,15 +1613,15 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. 
*/ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1488,12 +1632,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1508,7 +1652,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1520,8 +1664,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1539,15 +1683,33 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (kill_current_task) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); + + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { + /* + * Saved RIP on stack makes it look like the machine check + * was taken in the kernel on the instruction following + * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates + * that the machine check was taken inside SEAM non-root + * mode. CPU core has already marked that guest as dead. + * It is OK for the kernel to resume execution at the + * apparent point of the machine check as the fault did + * not occur there. Mark the page as poisoned so it won't + * be added to free list when the guest is terminated. + */ + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } } else { /* * Handle an MCE which has happened in kernel space but from @@ -1558,20 +1720,23 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. 
*/ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: + /* Given it didn't panic, mark it as recoverable */ + hwerr_log_error_type(HWERR_RECOV_OTHERS); + instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1598,13 +1763,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; @@ -1618,6 +1776,18 @@ static void __start_timer(struct timer_list *t, unsigned long interval) local_irq_restore(flags); } +static void mc_poll_banks_default(void) +{ + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); +} + +void (*mc_poll_banks)(void) = mc_poll_banks_default; + +static bool should_enable_timer(unsigned long iv) +{ + return !mca_cfg.ignore_ce && iv; +} + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1627,14 +1797,8 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); - if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); - - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } + if (mce_available(this_cpu_ptr(&cpu_info))) + mc_poll_banks(); /* * Alert userspace if needed. If we logged an MCE, reduce the polling @@ -1645,55 +1809,39 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); -done: - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (mce_get_storm_mode()) { + __start_timer(t, HZ); + } else if (should_enable_timer(iv)) { + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } /* - * Ensure that the timer is firing in @interval from now. + * When a storm starts on any bank on this CPU, switch to polling + * once per second. When the storm ends, revert to the default + * polling interval. */ -void mce_timer_kick(unsigned long interval) +void mce_timer_kick(bool storm) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - __start_timer(t, interval); + mce_set_storm_mode(storm); - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); + if (storm) + __start_timer(t, HZ); + else + __this_cpu_write(mce_next_interval, check_interval * HZ); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where timer_delete_sync() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); -} - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. 
- */ -int mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return 1; - } - return 0; + timer_delete_sync(&per_cpu(mce_timer, cpu)); } -EXPORT_SYMBOL_GPL(mce_notify_irq); static void __mcheck_cpu_mce_banks_init(void) { @@ -1705,9 +1853,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1722,7 +1871,7 @@ static void __mcheck_cpu_cap_init(void) u64 cap; u8 b; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; @@ -1735,69 +1884,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. 
- */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1805,145 +1919,97 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; - } - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* - * Various K7s with broken bank 0 around. Always disable - * by default. + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; - - /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. - */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; - + mca_cfg.bootlog = 0; } - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) +{ + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; - if (c->x86 == 6 && c->x86_model == 45) - mce_flags.snb_ifu_quirk = 1; + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. 
+ */ + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; - } + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; - if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } - } + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} - return 0; +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; + } } -static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return 0; + return false; switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); mce_flags.p5 = 1; - return 1; + return true; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); mce_flags.winchip = 1; - return 1; + return true; default: - return 0; + return false; } - return 0; -} - -/* - * Init basic CPU features needed for early decoding of MCEs. 
- */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } + return false; } static void mce_centaur_feature_init(struct cpuinfo_x86 *c) @@ -1982,7 +2048,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) intel_init_cmci(); intel_init_lmce(); - mce_adjust_timer = cmci_intel_adjust_timer; } static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) @@ -1995,16 +2060,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { - mce_amd_feature_init(c); break; - } + case X86_VENDOR_AMD: case X86_VENDOR_HYGON: - mce_hygon_feature_init(c); + mce_amd_feature_init(c); break; case X86_VENDOR_CENTAUR: @@ -2040,11 +2100,10 @@ static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; - if (mca_cfg.ignore_ce || !iv) - return; - - this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (should_enable_timer(iv)) { + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } static void __mcheck_cpu_setup_timer(void) @@ -2121,6 +2180,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) @@ -2136,6 +2220,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. 
*/ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: @@ -2153,12 +2284,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { - mca_cfg.disabled = 1; - return; - } - - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; @@ -2166,12 +2292,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2295,7 +2420,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2318,13 +2443,13 @@ static void vendor_disable_error_reporting(void) mce_disable_error_reporting(); } -static int mce_syscore_suspend(void) +static int mce_syscore_suspend(void *data) { vendor_disable_error_reporting(); return 0; } -static void mce_syscore_shutdown(void) +static void mce_syscore_shutdown(void *data) { vendor_disable_error_reporting(); } @@ -2334,19 +2459,24 @@ static void mce_syscore_shutdown(void) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static void mce_syscore_resume(void) +static void mce_syscore_resume(void *data) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } -static struct syscore_ops mce_syscore_ops = { +static const struct syscore_ops mce_syscore_ops = { .suspend = mce_syscore_suspend, .shutdown = mce_syscore_shutdown, .resume = mce_syscore_resume, }; +static struct syscore mce_syscore = { + .ops = &mce_syscore_ops, +}; + /* * mce_device: Sysfs support */ @@ -2356,8 +2486,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ @@ -2365,6 +2496,7 @@ static void mce_restart(void) { mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); + mce_schedule_work(); } /* Toggle features for corrected errors */ @@ -2385,7 +2517,7 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct bus_type mce_subsys = { +static const struct bus_type mce_subsys = { .name = "machinecheck", .dev_name = "machinecheck", }; @@ -2428,12 +2560,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + 
mutex_unlock(&mce_sysfs_mutex); return size; } @@ -2554,9 +2688,6 @@ static int mce_device_create(unsigned int cpu) int err; int i, j; - if (!mce_available(&boot_cpu_data)) - return -EIO; - dev = per_cpu(mce_device, cpu); if (dev) return 0; @@ -2645,14 +2776,12 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); } } static int mce_cpu_dead(unsigned int cpu) { - mce_intel_hcpu_update(cpu); - /* intentionally ignoring frozen here */ if (!cpuhp_tasks_frozen) cmci_rediscover(); @@ -2662,15 +2791,9 @@ static int mce_cpu_dead(unsigned int cpu) static int mce_cpu_online(unsigned int cpu) { struct timer_list *t = this_cpu_ptr(&mce_timer); - int ret; mce_device_create(cpu); - - ret = mce_threshold_create_device(cpu); - if (ret) { - mce_device_remove(cpu); - return ret; - } + mce_threshold_create_device(cpu); mce_reenable_cpu(); mce_start_timer(t); return 0; @@ -2681,7 +2804,7 @@ static int mce_cpu_pre_down(unsigned int cpu) struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); - del_timer_sync(t); + timer_delete_sync(t); mce_threshold_remove_device(cpu); mce_device_remove(cpu); return 0; @@ -2754,7 +2877,7 @@ static __init int mcheck_init_device(void) if (err < 0) goto err_out_online; - register_syscore_ops(&mce_syscore_ops); + register_syscore(&mce_syscore); return 0; |
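
Note on the interface change above (illustrative sketch only, not part of the patch): most paths now carry a struct mce_hw_err wrapper instead of a bare struct mce, with the legacy record embedded as its m member and AMD SMCA extended syndromes kept in a vendor union. A decoder-chain callback under the new convention, inferred from mce_early_notifier()/mce_default_notifier() in the hunks above, would look roughly like this; apart from to_mce_hw_err() and the err->m layout taken from the patch, the names here are hypothetical:

	/* Sketch: recover the wrapper from the opaque notifier pointer and
	 * read legacy fields through the embedded struct mce. */
	static int example_decode_notifier(struct notifier_block *nb,
					   unsigned long val, void *data)
	{
		struct mce_hw_err *err = to_mce_hw_err(data);

		if (!err)
			return NOTIFY_DONE;

		/* Legacy fields live in err->m; AMD extended syndromes in
		 * err->vendor.amd.synd1/synd2 when mce_flags.smca is set. */
		pr_debug("CPU %u bank %d status 0x%llx\n",
			 err->m.extcpu, err->m.bank, err->m.status);

		return NOTIFY_DONE;
	}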
