summaryrefslogtreecommitdiff
path: root/drivers/edac
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-08-30 19:23:00 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-08-30 19:23:00 -0700
commitbb511d4b25a75bd67c4db4dcb570b2ca3b42c926 (patch)
treef18346062c11ddb0ab2d5e1561691ac93d6f297f /drivers/edac
parenta55b0a028877e9d7e7dacdbe363d39390554ba14 (diff)
parentce53ad81ed36c24aff075f94474adecfabfcf239 (diff)
Merge tag 'edac_updates_for_v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras
Pull intel EDAC fixes from Tony Luck: - Old igen6 driver could lose pending events during initialization - Sapphire Rapids workstations have fewer memory controllers than their bigger siblings. This confused the driver. * tag 'edac_updates_for_v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras: EDAC/igen6: Fix the issue of no error events EDAC/i10nm: Skip the absent memory controllers
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/i10nm_base.c54
-rw-r--r--drivers/edac/igen6_edac.c8
2 files changed, 53 insertions, 9 deletions
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 5abf997ca7c1..2b83d6de9352 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -658,13 +658,49 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi
return mdev;
}
+/**
+ * i10nm_imc_absent() - Check whether the memory controller @imc is absent
+ *
+ * @imc : The pointer to the structure of memory controller EDAC device.
+ *
+ * RETURNS : true if the memory controller EDAC device is absent, false otherwise.
+ */
+static bool i10nm_imc_absent(struct skx_imc *imc)
+{
+ u32 mcmtr;
+ int i;
+
+ switch (res_cfg->type) {
+ case SPR:
+ for (i = 0; i < res_cfg->ddr_chan_num; i++) {
+ mcmtr = I10NM_GET_MCMTR(imc, i);
+ edac_dbg(1, "ch%d mcmtr reg %x\n", i, mcmtr);
+ if (mcmtr != ~0)
+ return false;
+ }
+
+ /*
+ * Some workstations' absent memory controllers still
+ * appear as PCIe devices, misleading the EDAC driver.
+ * By observing that the MMIO registers of these absent
+ * memory controllers consistently hold the value of ~0.
+ *
+ * We identify a memory controller as absent by checking
+ * if its MMIO register "mcmtr" == ~0 in all its channels.
+ */
+ return true;
+ default:
+ return false;
+ }
+}
+
static int i10nm_get_ddr_munits(void)
{
struct pci_dev *mdev;
void __iomem *mbase;
unsigned long size;
struct skx_dev *d;
- int i, j = 0;
+ int i, lmc, j = 0;
u32 reg, off;
u64 base;
@@ -690,7 +726,7 @@ static int i10nm_get_ddr_munits(void)
edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
j++, base, reg);
- for (i = 0; i < res_cfg->ddr_imc_num; i++) {
+ for (lmc = 0, i = 0; i < res_cfg->ddr_imc_num; i++) {
mdev = get_ddr_munit(d, i, &off, &size);
if (i == 0 && !mdev) {
@@ -700,8 +736,6 @@ static int i10nm_get_ddr_munits(void)
if (!mdev)
continue;
- d->imc[i].mdev = mdev;
-
edac_dbg(2, "mc%d mmio base 0x%llx size 0x%lx (reg 0x%x)\n",
i, base + off, size, reg);
@@ -712,7 +746,17 @@ static int i10nm_get_ddr_munits(void)
return -ENODEV;
}
- d->imc[i].mbase = mbase;
+ d->imc[lmc].mbase = mbase;
+ if (i10nm_imc_absent(&d->imc[lmc])) {
+ pci_dev_put(mdev);
+ iounmap(mbase);
+ d->imc[lmc].mbase = NULL;
+ edac_dbg(2, "Skip absent mc%d\n", i);
+ continue;
+ } else {
+ d->imc[lmc].mdev = mdev;
+ lmc++;
+ }
}
}
diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 544dd19072ea..1a18693294db 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -27,7 +27,7 @@
#include "edac_mc.h"
#include "edac_module.h"
-#define IGEN6_REVISION "v2.5"
+#define IGEN6_REVISION "v2.5.1"
#define EDAC_MOD_STR "igen6_edac"
#define IGEN6_NMI_NAME "igen6_ibecc"
@@ -1216,9 +1216,6 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
INIT_WORK(&ecclog_work, ecclog_work_cb);
init_irq_work(&ecclog_irq_work, ecclog_irq_work_cb);
- /* Check if any pending errors before registering the NMI handler */
- ecclog_handler();
-
rc = register_err_handler();
if (rc)
goto fail3;
@@ -1230,6 +1227,9 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
goto fail4;
}
+ /* Check if any pending errors before/during the registration of the error handler */
+ ecclog_handler();
+
igen6_debug_setup();
return 0;
fail4: