summaryrefslogtreecommitdiff
path: root/drivers/edac/mce_amd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/edac/mce_amd.c')
-rw-r--r--drivers/edac/mce_amd.c389
1 files changed, 253 insertions, 136 deletions
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 30f7309446a6..af3c12284a1e 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1,34 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/slab.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
#include "mce_amd.h"
-static struct amd_decoder_ops *fam_ops;
+static struct amd_decoder_ops fam_ops;
static u8 xec_mask = 0xf;
-static u8 nb_err_cpumask = 0xf;
-
-static bool report_gart_errors;
-static void (*nb_bus_decoder)(int node_id, struct mce *m);
-void amd_report_gart_errors(bool v)
-{
- report_gart_errors = v;
-}
-EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+static void (*decode_dram_ecc)(int node_id, struct mce *m);
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
- nb_bus_decoder = f;
+ decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
- if (nb_bus_decoder) {
- WARN_ON(nb_bus_decoder != f);
+ if (decode_dram_ecc) {
+ WARN_ON(decode_dram_ecc != f);
- nb_bus_decoder = NULL;
+ decode_dram_ecc = NULL;
}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
@@ -79,7 +75,8 @@ static const char * const f15h_mc1_mce_desc[] = {
"uop queue",
"insn buffer",
"predecode buffer",
- "fetch address FIFO"
+ "fetch address FIFO",
+ "dispatch uop queue"
};
static const char * const f15h_mc2_mce_desc[] = {
@@ -138,6 +135,15 @@ static const char * const mc5_mce_desc[] = {
"Retire status queue"
};
+static const char * const mc6_mce_desc[] = {
+ "Hardware Assertion",
+ "Free List",
+ "Physical Register File",
+ "Retire Queue",
+ "Scheduler table",
+ "Status Register File",
+};
+
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
bool ret = false;
@@ -268,6 +274,12 @@ static bool f15h_mc0_mce(u16 ec, u8 xec)
pr_cont("System Read Data Error.\n");
else
pr_cont(" Internal error condition type %d.\n", xec);
+ } else if (INT_ERROR(ec)) {
+ if (xec <= 0x1f)
+ pr_cont("Hardware Assert.\n");
+ else
+ ret = false;
+
} else
ret = false;
@@ -289,7 +301,7 @@ static void decode_mc0_mce(struct mce *m)
: (xec ? "multimatch" : "parity")));
return;
}
- } else if (fam_ops->mc0_mce(ec, xec))
+ } else if (fam_ops.mc0_mce(ec, xec))
;
else
pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
@@ -374,7 +386,7 @@ static bool f15h_mc1_mce(u16 ec, u8 xec)
pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
break;
- case 0x11 ... 0x14:
+ case 0x11 ... 0x15:
pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
break;
@@ -398,10 +410,20 @@ static void decode_mc1_mce(struct mce *m)
bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
- } else if (fam_ops->mc1_mce(ec, xec))
+ } else if (INT_ERROR(ec)) {
+ if (xec <= 0x3f)
+ pr_cont("Hardware Assert.\n");
+ else
+ goto wrong_mc1_mce;
+ } else if (fam_ops.mc1_mce(ec, xec))
;
else
- pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
+ goto wrong_mc1_mce;
+
+ return;
+
+wrong_mc1_mce:
+ pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}
static bool k8_mc2_mce(u16 ec, u8 xec)
@@ -416,8 +438,8 @@ static bool k8_mc2_mce(u16 ec, u8 xec)
pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
else if (xec == 0x0) {
if (TLB_ERROR(ec))
- pr_cont(": %s error in a Page Descriptor Cache or "
- "Guest TLB.\n", TT_MSG(ec));
+ pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
+ TT_MSG(ec));
else if (BUS_ERROR(ec))
pr_cont(": %s/ECC error in data read from NB: %s.\n",
R4_MSG(ec), PP_MSG(ec));
@@ -469,6 +491,11 @@ static bool f15h_mc2_mce(u16 ec, u8 xec)
default:
ret = false;
}
+ } else if (INT_ERROR(ec)) {
+ if (xec <= 0x3f)
+ pr_cont("Hardware Assert.\n");
+ else
+ ret = false;
}
return ret;
@@ -522,7 +549,7 @@ static void decode_mc2_mce(struct mce *m)
pr_emerg(HW_ERR "MC2 Error: ");
- if (!fam_ops->mc2_mce(ec, xec))
+ if (!fam_ops.mc2_mce(ec, xec))
pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
}
@@ -557,8 +584,8 @@ static void decode_mc3_mce(struct mce *m)
static void decode_mc4_mce(struct mce *m)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- int node_id = amd_get_nb_id(m->extcpu);
+ unsigned int fam = x86_family(m->cpuid);
+ int node_id = topology_amd_node_id(m->extcpu);
u16 ec = EC(m->status);
u8 xec = XEC(m->status, 0x1f);
u8 offset = 0;
@@ -571,13 +598,13 @@ static void decode_mc4_mce(struct mce *m)
/* special handling for DRAM ECCs */
if (xec == 0x0 || xec == 0x8) {
/* no ECCs on F11h */
- if (c->x86 == 0x11)
+ if (fam == 0x11)
goto wrong_mc4_mce;
pr_cont("%s.\n", mc4_mce_desc[xec]);
- if (nb_bus_decoder)
- nb_bus_decoder(node_id, m);
+ if (decode_dram_ecc)
+ decode_dram_ecc(node_id, m);
return;
}
break;
@@ -592,7 +619,7 @@ static void decode_mc4_mce(struct mce *m)
return;
case 0x19:
- if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
+ if (fam == 0x15 || fam == 0x16)
pr_cont("Compute Unit Data Error.\n");
else
goto wrong_mc4_mce;
@@ -615,14 +642,23 @@ static void decode_mc4_mce(struct mce *m)
static void decode_mc5_mce(struct mce *m)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
+ unsigned int fam = x86_family(m->cpuid);
+ u16 ec = EC(m->status);
u8 xec = XEC(m->status, xec_mask);
- if (c->x86 == 0xf || c->x86 == 0x11)
+ if (fam == 0xf || fam == 0x11)
goto wrong_mc5_mce;
pr_emerg(HW_ERR "MC5 Error: ");
+ if (INT_ERROR(ec)) {
+ if (xec <= 0x1f) {
+ pr_cont("Hardware Assert.\n");
+ return;
+ } else
+ goto wrong_mc5_mce;
+ }
+
if (xec == 0x0 || xec == 0xc)
pr_cont("%s.\n", mc5_mce_desc[xec]);
else if (xec <= 0xd)
@@ -642,38 +678,76 @@ static void decode_mc6_mce(struct mce *m)
pr_emerg(HW_ERR "MC6 Error: ");
- switch (xec) {
- case 0x1:
- pr_cont("Free List");
- break;
+ if (xec > 0x5)
+ goto wrong_mc6_mce;
- case 0x2:
- pr_cont("Physical Register File");
- break;
+ pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
+ return;
- case 0x3:
- pr_cont("Retire Queue");
- break;
+ wrong_mc6_mce:
+ pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
+}
- case 0x4:
- pr_cont("Scheduler table");
- break;
+static const char * const smca_long_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "Load Store Unit",
+ [SMCA_IF] = "Instruction Fetch Unit",
+ [SMCA_L2_CACHE] = "L2 Cache",
+ [SMCA_DE] = "Decode Unit",
+ [SMCA_RESERVED] = "Reserved",
+ [SMCA_EX] = "Execution Unit",
+ [SMCA_FP] = "Floating Point Unit",
+ [SMCA_L3_CACHE] = "L3 Cache",
+ [SMCA_CS ... SMCA_CS_V2] = "Coherent Slave",
+ [SMCA_PIE] = "Power, Interrupts, etc.",
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = "Unified Memory Controller",
+ [SMCA_UMC_V2] = "Unified Memory Controller v2",
+ [SMCA_PB] = "Parameter Block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor",
+ [SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit",
+ [SMCA_MP5] = "Microprocessor 5 Unit",
+ [SMCA_MPDMA] = "MPDMA Unit",
+ [SMCA_NBIO] = "Northbridge IO Unit",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit",
+ [SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit",
+ [SMCA_NBIF] = "NBIF Unit",
+ [SMCA_SHUB] = "System Hub Unit",
+ [SMCA_SATA] = "SATA Unit",
+ [SMCA_USB] = "USB Unit",
+ [SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit",
+ [SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit",
+ [SMCA_WAFL_PHY] = "WAFL PHY Unit",
+ [SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit",
+};
- case 0x5:
- pr_cont("Status Register File");
- break;
+static const char *smca_get_long_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
- default:
- goto wrong_mc6_mce;
- break;
- }
+ return smca_long_names[t];
+}
- pr_cont(" parity error.\n");
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_error(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
+ u8 xec = XEC(m->status, xec_mask);
- return;
+ if (bank_type >= N_SMCA_BANK_TYPES)
+ return;
- wrong_mc6_mce:
- pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
+ if (bank_type == SMCA_RESERVED) {
+ pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
+ return;
+ }
+
+ pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);
+
+ if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
+ xec == 0 && decode_dram_ecc)
+ decode_dram_ecc(topology_amd_node_id(m->extcpu), m);
}
static inline void amd_decode_err_code(u16 ec)
@@ -700,22 +774,6 @@ static inline void amd_decode_err_code(u16 ec)
pr_cont("\n");
}
-/*
- * Filter out unwanted MCE signatures here.
- */
-static bool amd_filter_mce(struct mce *m)
-{
- u8 xec = (m->status >> 16) & 0x1f;
-
- /*
- * NB GART TLB error reporting is disabled by default.
- */
- if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
- return true;
-
- return false;
-}
-
static const char *decode_error_status(struct mce *m)
{
if (m->status & MCI_STATUS_UC) {
@@ -727,19 +785,97 @@ static const char *decode_error_status(struct mce *m)
}
if (m->status & MCI_STATUS_DEFERRED)
- return "Deferred error.";
+ return "Deferred error, no action required.";
return "Corrected error, no action required.";
}
-int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
+static int
+amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
struct mce *m = (struct mce *)data;
- struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
+ struct mce_hw_err *err = to_mce_hw_err(m);
+ unsigned int fam = x86_family(m->cpuid);
+ u32 mca_config_lo = 0, dummy;
int ecc;
- if (amd_filter_mce(m))
- return NOTIFY_STOP;
+ if (m->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
+ pr_emerg(HW_ERR "%s\n", decode_error_status(m));
+
+ pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
+ m->extcpu,
+ fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
+ m->bank,
+ ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
+ ((m->status & MCI_STATUS_UC) ? "UE" :
+ (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
+ ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
+ ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
+ ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &mca_config_lo, &dummy);
+
+ if (mca_config_lo & MCI_CONFIG_MCAX)
+ pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+
+ pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
+ }
+
+ /* do the two bits[14:13] together */
+ ecc = (m->status >> 45) & 0x3;
+ if (ecc)
+ pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
+
+ if (fam >= 0x15) {
+ pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
+
+ /* F15h, bank4, bit 43 is part of McaStatSubCache. */
+ if (fam != 0x15 || m->bank != 4)
+ pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
+ }
+
+ if (fam >= 0x17)
+ pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
+
+ pr_cont("]: 0x%016llx\n", m->status);
+
+ if (m->status & MCI_STATUS_ADDRV)
+ pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
+
+ if (m->ppin)
+ pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
+
+ if (m->status & MCI_STATUS_SYNDV) {
+ pr_cont(", Syndrome: 0x%016llx\n", m->synd);
+ if (mca_config_lo & MCI_CONFIG_FRUTEXT) {
+ char frutext[17];
+
+ frutext[16] = '\0';
+ memcpy(&frutext[0], &err->vendor.amd.synd1, 8);
+ memcpy(&frutext[8], &err->vendor.amd.synd2, 8);
+
+ pr_emerg(HW_ERR "FRU Text: %s", frutext);
+ }
+ }
+
+ pr_cont("\n");
+
+ decode_smca_error(m);
+ goto err_code;
+ }
+
+ if (m->tsc)
+ pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
+
+ /* Doesn't matter which member to test. */
+ if (!fam_ops.mc0_mce)
+ goto err_code;
switch (m->bank) {
case 0:
@@ -774,109 +910,91 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
break;
}
- pr_emerg(HW_ERR "Error Status: %s\n", decode_error_status(m));
-
- pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
- m->extcpu,
- c->x86, c->x86_model, c->x86_mask,
- m->bank,
- ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
- ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
- ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
- ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
- ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
-
- if (c->x86 == 0x15 || c->x86 == 0x16)
- pr_cont("|%s|%s",
- ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
- ((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
-
- /* do the two bits[14:13] together */
- ecc = (m->status >> 45) & 0x3;
- if (ecc)
- pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
-
- pr_cont("]: 0x%016llx\n", m->status);
-
- if (m->status & MCI_STATUS_ADDRV)
- pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
-
+ err_code:
amd_decode_err_code(m->status & 0xffff);
- return NOTIFY_STOP;
+ m->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}
-EXPORT_SYMBOL_GPL(amd_decode_mce);
static struct notifier_block amd_mce_dec_nb = {
.notifier_call = amd_decode_mce,
+ .priority = MCE_PRIO_EDAC,
};
static int __init mce_amd_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (c->x86_vendor != X86_VENDOR_AMD)
- return 0;
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON)
+ return -ENODEV;
- if (c->x86 < 0xf || c->x86 > 0x16)
- return 0;
+ if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return -ENODEV;
- fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
- if (!fam_ops)
- return -ENOMEM;
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ xec_mask = 0x3f;
+ goto out;
+ }
switch (c->x86) {
case 0xf:
- fam_ops->mc0_mce = k8_mc0_mce;
- fam_ops->mc1_mce = k8_mc1_mce;
- fam_ops->mc2_mce = k8_mc2_mce;
+ fam_ops.mc0_mce = k8_mc0_mce;
+ fam_ops.mc1_mce = k8_mc1_mce;
+ fam_ops.mc2_mce = k8_mc2_mce;
break;
case 0x10:
- fam_ops->mc0_mce = f10h_mc0_mce;
- fam_ops->mc1_mce = k8_mc1_mce;
- fam_ops->mc2_mce = k8_mc2_mce;
+ fam_ops.mc0_mce = f10h_mc0_mce;
+ fam_ops.mc1_mce = k8_mc1_mce;
+ fam_ops.mc2_mce = k8_mc2_mce;
break;
case 0x11:
- fam_ops->mc0_mce = k8_mc0_mce;
- fam_ops->mc1_mce = k8_mc1_mce;
- fam_ops->mc2_mce = k8_mc2_mce;
+ fam_ops.mc0_mce = k8_mc0_mce;
+ fam_ops.mc1_mce = k8_mc1_mce;
+ fam_ops.mc2_mce = k8_mc2_mce;
break;
case 0x12:
- fam_ops->mc0_mce = f12h_mc0_mce;
- fam_ops->mc1_mce = k8_mc1_mce;
- fam_ops->mc2_mce = k8_mc2_mce;
+ fam_ops.mc0_mce = f12h_mc0_mce;
+ fam_ops.mc1_mce = k8_mc1_mce;
+ fam_ops.mc2_mce = k8_mc2_mce;
break;
case 0x14:
- nb_err_cpumask = 0x3;
- fam_ops->mc0_mce = cat_mc0_mce;
- fam_ops->mc1_mce = cat_mc1_mce;
- fam_ops->mc2_mce = k8_mc2_mce;
+ fam_ops.mc0_mce = cat_mc0_mce;
+ fam_ops.mc1_mce = cat_mc1_mce;
+ fam_ops.mc2_mce = k8_mc2_mce;
break;
case 0x15:
- xec_mask = 0x1f;
- fam_ops->mc0_mce = f15h_mc0_mce;
- fam_ops->mc1_mce = f15h_mc1_mce;
- fam_ops->mc2_mce = f15h_mc2_mce;
+ xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
+
+ fam_ops.mc0_mce = f15h_mc0_mce;
+ fam_ops.mc1_mce = f15h_mc1_mce;
+ fam_ops.mc2_mce = f15h_mc2_mce;
break;
case 0x16:
xec_mask = 0x1f;
- fam_ops->mc0_mce = cat_mc0_mce;
- fam_ops->mc1_mce = cat_mc1_mce;
- fam_ops->mc2_mce = f16h_mc2_mce;
+ fam_ops.mc0_mce = cat_mc0_mce;
+ fam_ops.mc1_mce = cat_mc1_mce;
+ fam_ops.mc2_mce = f16h_mc2_mce;
break;
+ case 0x17:
+ case 0x18:
+ pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
+ return -EINVAL;
+
default:
printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
- kfree(fam_ops);
return -EINVAL;
}
+out:
pr_info("MCE: In-kernel MCE decoding enabled.\n");
mce_register_decode_chain(&amd_mce_dec_nb);
@@ -889,7 +1007,6 @@ early_initcall(mce_amd_init);
static void __exit mce_amd_exit(void)
{
mce_unregister_decode_chain(&amd_mce_dec_nb);
- kfree(fam_ops);
}
MODULE_DESCRIPTION("AMD MCE decoder");