diff options
Diffstat (limited to 'drivers/edac/skx_common.c')
-rw-r--r-- | drivers/edac/skx_common.c | 172 |
1 files changed, 138 insertions, 34 deletions
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 9c5b6f8bd8bd..f7bd930e058f 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -19,6 +19,7 @@ #include <linux/adxl.h> #include <acpi/nfit.h> #include <asm/mce.h> +#include <asm/uv/uv.h> #include "edac_module.h" #include "skx_common.h" @@ -47,8 +48,9 @@ static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); static bool skx_mem_cfg_2lm; +static struct res_config *skx_res_cfg; -int __init skx_adxl_get(void) +int skx_adxl_get(void) { const char * const *names; int i, j; @@ -110,14 +112,16 @@ err: return -ENODEV; } +EXPORT_SYMBOL_GPL(skx_adxl_get); -void __exit skx_adxl_put(void) +void skx_adxl_put(void) { kfree(adxl_values); kfree(adxl_msg); } +EXPORT_SYMBOL_GPL(skx_adxl_put); -static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) +static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { struct skx_dev *d; int i, len = 0; @@ -133,8 +137,24 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me return false; } + /* + * GNR with a Flat2LM memory configuration may mistakenly classify + * a near-memory error(DDR5) as a far-memory error(CXL), resulting + * in the incorrect selection of decoded ADXL components. + * To address this, prefetch the decoded far-memory controller ID + * and adjust the error source to near-memory if the far-memory + * controller ID is invalid. + */ + if (skx_res_cfg && skx_res_cfg->type == GNR && err_src == ERR_SRC_2LM_FM) { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + if (res->imc == -1) { + err_src = ERR_SRC_2LM_NM; + edac_dbg(0, "Adjust the error source to near-memory.\n"); + } + } + res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - if (error_in_1st_level_mem) { + if (err_src == ERR_SRC_2LM_NM) { res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? @@ -187,38 +207,66 @@ void skx_set_mem_cfg(bool mem_cfg_2lm) { skx_mem_cfg_2lm = mem_cfg_2lm; } +EXPORT_SYMBOL_GPL(skx_set_mem_cfg); + +void skx_set_res_cfg(struct res_config *cfg) +{ + skx_res_cfg = cfg; +} +EXPORT_SYMBOL_GPL(skx_set_res_cfg); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { driver_decode = decode; skx_show_retry_rd_err_log = show_retry_log; } +EXPORT_SYMBOL_GPL(skx_set_decode); -int skx_get_src_id(struct skx_dev *d, int off, u8 *id) +static int skx_get_pkg_id(struct skx_dev *d, u8 *id) { - u32 reg; + int node; + int cpu; - if (pci_read_config_dword(d->util_all, off, ®)) { - skx_printk(KERN_ERR, "Failed to read src id\n"); - return -ENODEV; + node = pcibus_to_node(d->util_all->bus); + if (numa_valid_node(node)) { + for_each_cpu(cpu, cpumask_of_pcibus(d->util_all->bus)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) { + *id = c->topo.pkg_id; + return 0; + } + } } - *id = GET_BITFIELD(reg, 12, 14); - return 0; + skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n"); + return -ENODEV; } -int skx_get_node_id(struct skx_dev *d, u8 *id) +int skx_get_src_id(struct skx_dev *d, int off, u8 *id) { u32 reg; - if (pci_read_config_dword(d->util_all, 0xf4, ®)) { - skx_printk(KERN_ERR, "Failed to read node id\n"); + /* + * The 3-bit source IDs in PCI configuration space registers are limited + * to 8 unique IDs, and each ID is local to a UPI/QPI domain. + * + * Source IDs cannot be used to map devices to sockets on UV systems + * because they can exceed 8 sockets and have multiple UPI/QPI domains + * with identical, repeating source IDs. + */ + if (is_uv_system()) + return skx_get_pkg_id(d, id); + + if (pci_read_config_dword(d->util_all, off, ®)) { + skx_printk(KERN_ERR, "Failed to read src id\n"); return -ENODEV; } - *id = GET_BITFIELD(reg, 0, 2); + *id = GET_BITFIELD(reg, 12, 14); return 0; } +EXPORT_SYMBOL_GPL(skx_get_src_id); static int get_width(u32 mtr) { @@ -284,6 +332,7 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) *list = &dev_edac_list; return ndev; } +EXPORT_SYMBOL_GPL(skx_get_all_bus_mappings); int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm) { @@ -323,6 +372,7 @@ fail: pci_dev_put(pdev); return -ENODEV; } +EXPORT_SYMBOL_GPL(skx_get_hi_lo); static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, int minval, int maxval, const char *name) @@ -355,7 +405,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, if (imc->hbm_mc) { banks = 32; mtype = MEM_HBM2; - } else if (cfg->support_ddr5 && (amap & 0x8)) { + } else if (cfg->support_ddr5) { banks = 32; mtype = MEM_DDR5; } else { @@ -394,6 +444,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, return 1; } +EXPORT_SYMBOL_GPL(skx_get_dimm_info); int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str) @@ -442,6 +493,7 @@ unknown_size: return (size == 0 || size == ~0ull) ? 0 : 1; } +EXPORT_SYMBOL_GPL(skx_get_nvdimm_info); int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, const char *ctl_name, const char *mod_str, @@ -474,7 +526,7 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, pvt->imc = imc; mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name, - imc->node_id, imc->lmc); + imc->src_id, imc->lmc); if (!mci->ctl_name) { rc = -ENOMEM; goto fail0; @@ -512,6 +564,7 @@ fail0: imc->mci = NULL; return rc; } +EXPORT_SYMBOL_GPL(skx_register_mci); static void skx_unregister_mci(struct skx_imc *imc) { @@ -609,31 +662,27 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } -static bool skx_error_in_1st_level_mem(const struct mce *m) +static enum error_source skx_error_source(const struct mce *m) { - u32 errcode; - - if (!skx_mem_cfg_2lm) - return false; + u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; - errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; - - return errcode == MCACOD_EXT_MEM_ERR; -} + if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR) + return ERR_SRC_NOT_MEMORY; -static bool skx_error_in_mem(const struct mce *m) -{ - u32 errcode; + if (!skx_mem_cfg_2lm) + return ERR_SRC_1LM; - errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; + if (errcode == MCACOD_EXT_MEM_ERR) + return ERR_SRC_2LM_NM; - return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR); + return ERR_SRC_2LM_FM; } int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { struct mce *mce = (struct mce *)data; + enum error_source err_src; struct decoded_addr res; struct mem_ctl_info *mci; char *type; @@ -641,14 +690,16 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; + err_src = skx_error_source(mce); + /* Ignore unless this is memory related with an address */ - if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV)) + if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; memset(&res, 0, sizeof(res)); res.mce = mce; res.addr = mce->addr & MCI_ADDR_PHYSADDR; - if (!pfn_to_online_page(res.addr >> PAGE_SHIFT)) { + if (!pfn_to_online_page(res.addr >> PAGE_SHIFT) && !arch_is_platform_page(res.addr)) { pr_err("Invalid address 0x%llx in IA32_MC%d_ADDR\n", mce->addr, mce->bank); return NOTIFY_DONE; } @@ -656,7 +707,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, /* Try driver decoder first */ if (!(driver_decode && driver_decode(&res))) { /* Then try firmware decoder (ACPI DSM methods) */ - if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))) + if (!(adxl_component_count && skx_adxl_decode(&res, err_src))) return NOTIFY_DONE; } @@ -688,6 +739,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, mce->kflags |= MCE_HANDLED_EDAC; return NOTIFY_DONE; } +EXPORT_SYMBOL_GPL(skx_mce_check_error); void skx_remove(void) { @@ -725,3 +777,55 @@ void skx_remove(void) kfree(d); } } +EXPORT_SYMBOL_GPL(skx_remove); + +#ifdef CONFIG_EDAC_DEBUG +/* + * Debug feature. + * Exercise the address decode logic by writing an address to + * /sys/kernel/debug/edac/{skx,i10nm}_test/addr. + */ +static struct dentry *skx_test; + +static int debugfs_u64_set(void *data, u64 val) +{ + struct mce m; + + pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val); + + memset(&m, 0, sizeof(m)); + /* ADDRV + MemRd + Unknown channel */ + m.status = MCI_STATUS_ADDRV + 0x90; + /* One corrected error */ + m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT); + m.addr = val; + skx_mce_check_error(NULL, 0, &m); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); + +void skx_setup_debug(const char *name) +{ + skx_test = edac_debugfs_create_dir(name); + if (!skx_test) + return; + + if (!edac_debugfs_create_file("addr", 0200, skx_test, + NULL, &fops_u64_wo)) { + debugfs_remove(skx_test); + skx_test = NULL; + } +} +EXPORT_SYMBOL_GPL(skx_setup_debug); + +void skx_teardown_debug(void) +{ + debugfs_remove_recursive(skx_test); +} +EXPORT_SYMBOL_GPL(skx_teardown_debug); +#endif /*CONFIG_EDAC_DEBUG*/ + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Tony Luck"); +MODULE_DESCRIPTION("MC Driver for Intel server processors"); |