summaryrefslogtreecommitdiff
path: root/drivers/edac/skx_common.c
diff options
context:
space:
mode:
authorTony Luck <tony.luck@intel.com>2019-08-15 14:53:28 -0700
committerTony Luck <tony.luck@intel.com>2019-10-18 15:27:58 -0700
commite80634a75aba90e7485cd1fdb463fcac5d45f14d (patch)
tree94357e0ec240c2fd39429d27f700f6e361bb74c5 /drivers/edac/skx_common.c
parent29b8e84fbc23cb2b70317b745641ea0569426872 (diff)
EDAC, skx: Retrieve and print retry_rd_err_log registers
Skylake logs some additional useful information in per-channel registers in addition the the architectural status/addr/misc logged in the machine check bank. Pick up this information and add it to the EDAC log: retry_rd_err_[five 32-bit register values] Sorry, no definitions for these registers. OEMs and DIMM vendors will be able to use them to isolate which cells in the DIMM are causing problems. correrrcnt[per rank corrected error counts] Note that if additional errors are logged while these registers are being read, you may see a jumble of values some from earlier errors, others from later errors (since the registers report the most recent logged error). The correrrcnt registers provide error counts per possible rank. If these counts only change by one since the previous error logged for this channel, then it is safe to assume that the registers logged provide a coherent view of one error. With this change EDAC logs look like this: EDAC MC4: 1 CE memory read error on CPU_SrcID#2_MC#0_Chan#1_DIMM#0 (channel:1 slot:0 page:0x8f26018 offset:0x0 grain:32 syndrome:0x0 - err_code:0x0101:0x0091 socket:2 imc:0 rank:0 bg:0 ba:0 row:0x1f880 col:0x200 retry_rd_err_log[0001a209 00000000 00000001 04800001 0001f880] correrrcnt[0001 0000 0000 0000 0000 0000 0000 0000]) Acked-by: Aristeu Rozanski <aris@redhat.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'drivers/edac/skx_common.c')
-rw-r--r--drivers/edac/skx_common.c12
1 files changed, 9 insertions, 3 deletions
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index de06d58d7d2a..95662a4ff4c4 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -37,6 +37,7 @@ static char *adxl_msg;
static char skx_msg[MSG_SIZE];
static skx_decode_f skx_decode;
+static skx_show_retry_log_f skx_show_retry_rd_err_log;
static u64 skx_tolm, skx_tohm;
static LIST_HEAD(dev_edac_list);
@@ -150,9 +151,10 @@ static bool skx_adxl_decode(struct decoded_addr *res)
return true;
}
-void skx_set_decode(skx_decode_f decode)
+void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
{
skx_decode = decode;
+ skx_show_retry_rd_err_log = show_retry_log;
}
int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
@@ -481,6 +483,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
bool recoverable;
+ int len;
u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
u32 mscod = GET_BITFIELD(m->status, 16, 31);
u32 errcode = GET_BITFIELD(m->status, 0, 15);
@@ -537,12 +540,12 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
}
}
if (adxl_component_count) {
- snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s",
+ len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
mscod, errcode, adxl_msg);
} else {
- snprintf(skx_msg, MSG_SIZE,
+ len = snprintf(skx_msg, MSG_SIZE,
"%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
@@ -551,6 +554,9 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
res->bank_group, res->bank_address, res->row, res->column);
}
+ if (skx_show_retry_rd_err_log)
+ skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len);
+
edac_dbg(0, "%s\n", skx_msg);
/* Call the helper to output message */