From 495a91d0998367f4f079593f491bdfe8ef06838e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2023 09:11:40 -0500 Subject: x86/MCE/AMD: Split amd_mce_is_memory_error() Define helper functions for legacy and SMCA systems in order to reuse individual checks in later changes. Describe what each function is checking for, and correct the XEC bitmask for SMCA. No functional change intended. [ bp: Use "else in amd_mce_is_memory_error() to make the conditional balanced, for readability. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20230613141142.36801-2-yazen.ghannam@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c267f43de39e..c0699342cc74 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -713,17 +713,37 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -bool amd_mce_is_memory_error(struct mce *m) +/* + * DRAM ECC errors are reported in the Northbridge (bank 4) with + * Extended Error Code 8. + */ +static bool legacy_mce_is_memory_error(struct mce *m) +{ + return m->bank == 4 && XEC(m->status, 0x1f) == 8; +} + +/* + * DRAM ECC errors are reported in Unified Memory Controllers with + * Extended Error Code 0. + */ +static bool smca_mce_is_memory_error(struct mce *m) { enum smca_bank_types bank_type; - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + + if (XEC(m->status, 0x3f)) + return false; bank_type = smca_get_bank_type(m->extcpu, m->bank); - if (mce_flags.smca) - return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; - return m->bank == 4 && xec == 0x8; + return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2; +} + +bool amd_mce_is_memory_error(struct mce *m) +{ + if (mce_flags.smca) + return smca_mce_is_memory_error(m); + else + return legacy_mce_is_memory_error(m); } static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) -- cgit From 48da1ad8ba95ecd35d76355594c629f3ef2a954a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2023 09:11:41 -0500 Subject: x86/mce: Define amd_mce_usable_address() Currently, all valid MCA_ADDR values are assumed to be usable on AMD systems. However, this is not correct in most cases. Notifiers expecting usable addresses may then operate on inappropriate values. Define a helper function to do AMD-specific checks for a usable memory address. List out all known cases. [ bp: Tone down the capitalized words. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230613141142.36801-3-yazen.ghannam@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mce/core.c | 3 +++ arch/x86/kernel/cpu/mce/internal.h | 2 ++ 3 files changed, 43 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c0699342cc74..f3517b8a8e91 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -746,6 +746,44 @@ bool amd_mce_is_memory_error(struct mce *m) return legacy_mce_is_memory_error(m); } +/* + * AMD systems do not have an explicit indicator that the value in MCA_ADDR is + * a system physical address. Therefore, individual cases need to be detected. + * Future cases and checks will be added as needed. + * + * 1) General case + * a) Assume address is not usable. + * 2) Poison errors + * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy + * northbridge (bank 4). + * b) Refers to poison consumption in the core. Does not include "no action", + * "action optional", or "deferred" error severities. + * c) Will include a usable address so that immediate action can be taken. + * 3) Northbridge DRAM ECC errors + * a) Reported in legacy bank 4 with extended error code (XEC) 8. + * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, + * this bit should not be checked. + * + * NOTE: SMCA UMC memory errors fall into case #1. + */ +bool amd_mce_usable_address(struct mce *m) +{ + /* Check special northbridge case 3) first. */ + if (!mce_flags.smca) { + if (legacy_mce_is_memory_error(m)) + return true; + else if (m->bank == 4) + return false; + } + + /* Check poison bit for all other bank types. */ + if (m->status & MCI_STATUS_POISON) + return true; + + /* Assume address is not usable for all others. */ + return false; +} + static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6f35f724cc14..06c21f571d5a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -464,6 +464,9 @@ int mce_usable_address(struct mce *m) if (!(m->status & MCI_STATUS_ADDRV)) return 0; + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_mce_usable_address(m); + /* Checks after this one are Intel/Zhaoxin-specific: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index bcf1b3c66c9c..a191554d98f2 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -210,6 +210,7 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); +bool amd_mce_usable_address(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -237,6 +238,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m) #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void smca_extract_err_addr(struct mce *m) { } #endif -- cgit From 1bae0cfe4a171ccc5f731426296e45beafa096b8 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Jun 2023 09:11:42 -0500 Subject: x86/mce: Cleanup mce_usable_address() Move Intel-specific checks into a helper function. Explicitly use "bool" for return type. No functional change intended. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230613141142.36801-4-yazen.ghannam@amd.com --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mce/core.c | 33 ++++++++++----------------------- arch/x86/kernel/cpu/mce/intel.c | 20 ++++++++++++++++++++ arch/x86/kernel/cpu/mce/internal.h | 2 ++ 4 files changed, 33 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 180b1cbfcc4e..6de6e1d95952 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {} int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); -int mce_usable_address(struct mce *m); +bool mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 06c21f571d5a..0214d4232346 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -453,35 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granularity for now. - */ -int mce_usable_address(struct mce *m) +bool mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) - return 0; + return false; - if (m->cpuvendor == X86_VENDOR_AMD) + switch (m->cpuvendor) { + case X86_VENDOR_AMD: return amd_mce_usable_address(m); - /* Checks after this one are Intel/Zhaoxin-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; - - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; - - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: + return intel_mce_usable_address(m); - return 1; + default: + return true; + } } EXPORT_SYMBOL_GPL(mce_usable_address); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f5323551c1a9..52bce533ddcc 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m) return false; } + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granularity for now. + */ +bool intel_mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV)) + return false; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return false; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return false; + + return true; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index a191554d98f2..e13a26c9c0ac 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -49,6 +49,7 @@ void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); +bool intel_mce_usable_address(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } static inline bool intel_filter_mce(struct mce *m) { return false; } +static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); -- cgit