summaryrefslogtreecommitdiff
path: root/drivers/edac/skx_common.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/edac/skx_common.c')
-rw-r--r--drivers/edac/skx_common.c206
1 files changed, 172 insertions, 34 deletions
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 9c5b6f8bd8bd..c9ade45c1a99 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -19,6 +19,7 @@
#include <linux/adxl.h>
#include <acpi/nfit.h>
#include <asm/mce.h>
+#include <asm/uv/uv.h>
#include "edac_module.h"
#include "skx_common.h"
@@ -47,8 +48,9 @@ static skx_show_retry_log_f skx_show_retry_rd_err_log;
static u64 skx_tolm, skx_tohm;
static LIST_HEAD(dev_edac_list);
static bool skx_mem_cfg_2lm;
+static struct res_config *skx_res_cfg;
-int __init skx_adxl_get(void)
+int skx_adxl_get(void)
{
const char * const *names;
int i, j;
@@ -110,14 +112,46 @@ err:
return -ENODEV;
}
+EXPORT_SYMBOL_GPL(skx_adxl_get);
-void __exit skx_adxl_put(void)
+void skx_adxl_put(void)
{
+ adxl_component_count = 0;
kfree(adxl_values);
kfree(adxl_msg);
}
+EXPORT_SYMBOL_GPL(skx_adxl_put);
-static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem)
+static void skx_init_mc_mapping(struct skx_dev *d)
+{
+ /*
+ * By default, the BIOS presents all memory controllers within each
+ * socket to the EDAC driver. The physical indices are the same as
+ * the logical indices of the memory controllers enumerated by the
+ * EDAC driver.
+ */
+ for (int i = 0; i < NUM_IMC; i++)
+ d->mc_mapping[i] = i;
+}
+
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
+{
+ edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+ pmc, lmc);
+
+ d->mc_mapping[pmc] = lmc;
+}
+EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
+
+static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
+{
+ edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+ pmc, d->mc_mapping[pmc]);
+
+ return d->mc_mapping[pmc];
+}
+
+static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
{
struct skx_dev *d;
int i, len = 0;
@@ -133,8 +167,24 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
return false;
}
+ /*
+ * GNR with a Flat2LM memory configuration may mistakenly classify
+ * a near-memory error(DDR5) as a far-memory error(CXL), resulting
+ * in the incorrect selection of decoded ADXL components.
+ * To address this, prefetch the decoded far-memory controller ID
+ * and adjust the error source to near-memory if the far-memory
+ * controller ID is invalid.
+ */
+ if (skx_res_cfg && skx_res_cfg->type == GNR && err_src == ERR_SRC_2LM_FM) {
+ res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
+ if (res->imc == -1) {
+ err_src = ERR_SRC_2LM_NM;
+ edac_dbg(0, "Adjust the error source to near-memory.\n");
+ }
+ }
+
res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
- if (error_in_1st_level_mem) {
+ if (err_src == ERR_SRC_2LM_NM) {
res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
(int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
@@ -168,6 +218,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
return false;
}
+ res->imc = skx_get_mc_mapping(d, res->imc);
+
for (i = 0; i < adxl_component_count; i++) {
if (adxl_values[i] == ~0x0ull)
continue;
@@ -187,38 +239,66 @@ void skx_set_mem_cfg(bool mem_cfg_2lm)
{
skx_mem_cfg_2lm = mem_cfg_2lm;
}
+EXPORT_SYMBOL_GPL(skx_set_mem_cfg);
+
+void skx_set_res_cfg(struct res_config *cfg)
+{
+ skx_res_cfg = cfg;
+}
+EXPORT_SYMBOL_GPL(skx_set_res_cfg);
void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
{
driver_decode = decode;
skx_show_retry_rd_err_log = show_retry_log;
}
+EXPORT_SYMBOL_GPL(skx_set_decode);
-int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
+static int skx_get_pkg_id(struct skx_dev *d, u8 *id)
{
- u32 reg;
+ int node;
+ int cpu;
- if (pci_read_config_dword(d->util_all, off, &reg)) {
- skx_printk(KERN_ERR, "Failed to read src id\n");
- return -ENODEV;
+ node = pcibus_to_node(d->util_all->bus);
+ if (numa_valid_node(node)) {
+ for_each_cpu(cpu, cpumask_of_pcibus(d->util_all->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ *id = c->topo.pkg_id;
+ return 0;
+ }
+ }
}
- *id = GET_BITFIELD(reg, 12, 14);
- return 0;
+ skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n");
+ return -ENODEV;
}
-int skx_get_node_id(struct skx_dev *d, u8 *id)
+int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
{
u32 reg;
- if (pci_read_config_dword(d->util_all, 0xf4, &reg)) {
- skx_printk(KERN_ERR, "Failed to read node id\n");
+ /*
+ * The 3-bit source IDs in PCI configuration space registers are limited
+ * to 8 unique IDs, and each ID is local to a UPI/QPI domain.
+ *
+ * Source IDs cannot be used to map devices to sockets on UV systems
+ * because they can exceed 8 sockets and have multiple UPI/QPI domains
+ * with identical, repeating source IDs.
+ */
+ if (is_uv_system())
+ return skx_get_pkg_id(d, id);
+
+ if (pci_read_config_dword(d->util_all, off, &reg)) {
+ skx_printk(KERN_ERR, "Failed to read src id\n");
return -ENODEV;
}
- *id = GET_BITFIELD(reg, 0, 2);
+ *id = GET_BITFIELD(reg, 12, 14);
return 0;
}
+EXPORT_SYMBOL_GPL(skx_get_src_id);
static int get_width(u32 mtr)
{
@@ -278,12 +358,15 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
list_add_tail(&d->list, &dev_edac_list);
prev = pdev;
+
+ skx_init_mc_mapping(d);
}
if (list)
*list = &dev_edac_list;
return ndev;
}
+EXPORT_SYMBOL_GPL(skx_get_all_bus_mappings);
int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm)
{
@@ -323,6 +406,7 @@ fail:
pci_dev_put(pdev);
return -ENODEV;
}
+EXPORT_SYMBOL_GPL(skx_get_hi_lo);
static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add,
int minval, int maxval, const char *name)
@@ -355,7 +439,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
if (imc->hbm_mc) {
banks = 32;
mtype = MEM_HBM2;
- } else if (cfg->support_ddr5 && (amap & 0x8)) {
+ } else if (cfg->support_ddr5) {
banks = 32;
mtype = MEM_DDR5;
} else {
@@ -394,6 +478,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
return 1;
}
+EXPORT_SYMBOL_GPL(skx_get_dimm_info);
int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
int chan, int dimmno, const char *mod_str)
@@ -442,6 +527,7 @@ unknown_size:
return (size == 0 || size == ~0ull) ? 0 : 1;
}
+EXPORT_SYMBOL_GPL(skx_get_nvdimm_info);
int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
const char *ctl_name, const char *mod_str,
@@ -474,7 +560,7 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev,
pvt->imc = imc;
mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name,
- imc->node_id, imc->lmc);
+ imc->src_id, imc->lmc);
if (!mci->ctl_name) {
rc = -ENOMEM;
goto fail0;
@@ -512,6 +598,7 @@ fail0:
imc->mci = NULL;
return rc;
}
+EXPORT_SYMBOL_GPL(skx_register_mci);
static void skx_unregister_mci(struct skx_imc *imc)
{
@@ -609,31 +696,27 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
optype, skx_msg);
}
-static bool skx_error_in_1st_level_mem(const struct mce *m)
+static enum error_source skx_error_source(const struct mce *m)
{
- u32 errcode;
-
- if (!skx_mem_cfg_2lm)
- return false;
+ u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
- errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
+ if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR)
+ return ERR_SRC_NOT_MEMORY;
- return errcode == MCACOD_EXT_MEM_ERR;
-}
-
-static bool skx_error_in_mem(const struct mce *m)
-{
- u32 errcode;
+ if (!skx_mem_cfg_2lm)
+ return ERR_SRC_1LM;
- errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
+ if (errcode == MCACOD_EXT_MEM_ERR)
+ return ERR_SRC_2LM_NM;
- return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
+ return ERR_SRC_2LM_FM;
}
int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
+ enum error_source err_src;
struct decoded_addr res;
struct mem_ctl_info *mci;
char *type;
@@ -641,14 +724,16 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if (mce->kflags & MCE_HANDLED_CEC)
return NOTIFY_DONE;
+ err_src = skx_error_source(mce);
+
/* Ignore unless this is memory related with an address */
- if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
+ if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE;
memset(&res, 0, sizeof(res));
res.mce = mce;
res.addr = mce->addr & MCI_ADDR_PHYSADDR;
- if (!pfn_to_online_page(res.addr >> PAGE_SHIFT)) {
+ if (!pfn_to_online_page(res.addr >> PAGE_SHIFT) && !arch_is_platform_page(res.addr)) {
pr_err("Invalid address 0x%llx in IA32_MC%d_ADDR\n", mce->addr, mce->bank);
return NOTIFY_DONE;
}
@@ -656,7 +741,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
/* Try driver decoder first */
if (!(driver_decode && driver_decode(&res))) {
/* Then try firmware decoder (ACPI DSM methods) */
- if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))))
+ if (!(adxl_component_count && skx_adxl_decode(&res, err_src)))
return NOTIFY_DONE;
}
@@ -688,6 +773,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
mce->kflags |= MCE_HANDLED_EDAC;
return NOTIFY_DONE;
}
+EXPORT_SYMBOL_GPL(skx_mce_check_error);
void skx_remove(void)
{
@@ -725,3 +811,55 @@ void skx_remove(void)
kfree(d);
}
}
+EXPORT_SYMBOL_GPL(skx_remove);
+
+#ifdef CONFIG_EDAC_DEBUG
+/*
+ * Debug feature.
+ * Exercise the address decode logic by writing an address to
+ * /sys/kernel/debug/edac/{skx,i10nm}_test/addr.
+ */
+static struct dentry *skx_test;
+
+static int debugfs_u64_set(void *data, u64 val)
+{
+ struct mce m;
+
+ pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);
+
+ memset(&m, 0, sizeof(m));
+ /* ADDRV + MemRd + Unknown channel */
+ m.status = MCI_STATUS_ADDRV + 0x90;
+ /* One corrected error */
+ m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT);
+ m.addr = val;
+ skx_mce_check_error(NULL, 0, &m);
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
+
+void skx_setup_debug(const char *name)
+{
+ skx_test = edac_debugfs_create_dir(name);
+ if (!skx_test)
+ return;
+
+ if (!edac_debugfs_create_file("addr", 0200, skx_test,
+ NULL, &fops_u64_wo)) {
+ debugfs_remove(skx_test);
+ skx_test = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(skx_setup_debug);
+
+void skx_teardown_debug(void)
+{
+ debugfs_remove_recursive(skx_test);
+}
+EXPORT_SYMBOL_GPL(skx_teardown_debug);
+#endif /*CONFIG_EDAC_DEBUG*/
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tony Luck");
+MODULE_DESCRIPTION("MC Driver for Intel server processors");