diff options
Diffstat (limited to 'include/ras/ras_event.h')
| -rw-r--r-- | include/ras/ras_event.h | 293 |
1 files changed, 288 insertions, 5 deletions
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 21cdb0b7b0fb..eaecc3c5f772 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ras #define TRACE_INCLUDE_FILE ras_event @@ -8,6 +9,72 @@ #include <linux/tracepoint.h> #include <linux/edac.h> #include <linux/ktime.h> +#include <linux/pci.h> +#include <linux/aer.h> +#include <linux/cper.h> + +/* + * MCE Extended Error Log trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event. + */ + +/* memory trace event */ + +#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) +TRACE_EVENT(extlog_mem_event, + TP_PROTO(struct cper_sec_mem_err *mem, + u32 err_seq, + const guid_t *fru_id, + const char *fru_text, + u8 sev), + + TP_ARGS(mem, err_seq, fru_id, fru_text, sev), + + TP_STRUCT__entry( + __field(u32, err_seq) + __field(u8, etype) + __field(u8, sev) + __field(u64, pa) + __field(u8, pa_mask_lsb) + __field_struct(guid_t, fru_id) + __string(fru_text, fru_text) + __field_struct(struct cper_mem_err_compact, data) + ), + + TP_fast_assign( + __entry->err_seq = err_seq; + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) + __entry->etype = mem->error_type; + else + __entry->etype = ~0; + __entry->sev = sev; + if (mem->validation_bits & CPER_MEM_VALID_PA) + __entry->pa = mem->physical_addr; + else + __entry->pa = ~0ull; + + if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) + __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask); + else + __entry->pa_mask_lsb = ~0; + __entry->fru_id = *fru_id; + __assign_str(fru_text); + cper_mem_err_pack(mem, &__entry->data); + ), + + TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s", + __entry->err_seq, + cper_severity_str(__entry->sev), + cper_mem_err_type_str(__entry->etype), + __entry->pa, + __entry->pa_mask_lsb, + cper_mem_err_unpack(p, &__entry->data), + &__entry->fru_id, + __get_str(fru_text)) +); +#endif /* * Hardware Events Report @@ -63,8 +130,8 @@ TRACE_EVENT(mc_event, TP_fast_assign( __entry->error_type = err_type; - __assign_str(msg, error_msg); - __assign_str(label, label); + __assign_str(msg); + __assign_str(label); __entry->error_count = error_count; __entry->mc_index = mc_index; __entry->top_layer = top_layer; @@ -73,14 +140,14 @@ TRACE_EVENT(mc_event, __entry->address = address; __entry->grain_bits = grain_bits; __entry->syndrome = syndrome; - __assign_str(driver_detail, driver_detail); + __assign_str(driver_detail); ), TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", __entry->error_count, mc_event_error_type(__entry->error_type), __entry->error_count > 1 ? "s" : "", - ((char *)__get_str(msg))[0] ? " " : "", + __get_str(msg)[0] ? " " : "", __get_str(msg), __get_str(label), __entry->mc_index, @@ -90,10 +157,226 @@ TRACE_EVENT(mc_event, __entry->address, 1 << __entry->grain_bits, __entry->syndrome, - ((char *)__get_str(driver_detail))[0] ? " " : "", + __get_str(driver_detail)[0] ? " " : "", __get_str(driver_detail)) ); +/* + * ARM Processor Events Report + * + * This event is generated when hardware detects an ARM processor error + * has occurred. UEFI 2.6 spec section N.2.4.4. + */ +#define APEIL "ARM Processor Err Info data len" +#define APEID "ARM Processor Err Info raw data" +#define APECIL "ARM Processor Err Context Info data len" +#define APECID "ARM Processor Err Context Info raw data" +#define VSEIL "Vendor Specific Err Info data len" +#define VSEID "Vendor Specific Err Info raw data" +TRACE_EVENT(arm_event, + + TP_PROTO(const struct cper_sec_proc_arm *proc, + const u8 *pei_err, + const u32 pei_len, + const u8 *ctx_err, + const u32 ctx_len, + const u8 *oem, + const u32 oem_len, + u8 sev, + int cpu), + + TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu), + + TP_STRUCT__entry( + __field(u64, mpidr) + __field(u64, midr) + __field(u32, running_state) + __field(u32, psci_state) + __field(u8, affinity) + __field(u32, pei_len) + __dynamic_array(u8, pei_buf, pei_len) + __field(u32, ctx_len) + __dynamic_array(u8, ctx_buf, ctx_len) + __field(u32, oem_len) + __dynamic_array(u8, oem_buf, oem_len) + __field(u8, sev) + __field(int, cpu) + ), + + TP_fast_assign( + if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL) + __entry->affinity = proc->affinity_level; + else + __entry->affinity = ~0; + if (proc->validation_bits & CPER_ARM_VALID_MPIDR) + __entry->mpidr = proc->mpidr; + else + __entry->mpidr = 0ULL; + __entry->midr = proc->midr; + if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) { + __entry->running_state = proc->running_state; + __entry->psci_state = proc->psci_state; + } else { + __entry->running_state = ~0; + __entry->psci_state = ~0; + } + __entry->pei_len = pei_len; + memcpy(__get_dynamic_array(pei_buf), pei_err, pei_len); + __entry->ctx_len = ctx_len; + memcpy(__get_dynamic_array(ctx_buf), ctx_err, ctx_len); + __entry->oem_len = oem_len; + memcpy(__get_dynamic_array(oem_buf), oem, oem_len); + __entry->sev = sev; + __entry->cpu = cpu; + ), + + TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " + "running state: %d; PSCI state: %d; " + "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s", + __entry->cpu, + __entry->sev, + __entry->affinity, __entry->mpidr, __entry->midr, + __entry->running_state, __entry->psci_state, + APEIL, __entry->pei_len, APEID, + __print_hex(__get_dynamic_array(pei_buf), __entry->pei_len), + APECIL, __entry->ctx_len, APECID, + __print_hex(__get_dynamic_array(ctx_buf), __entry->ctx_len), + VSEIL, __entry->oem_len, VSEID, + __print_hex(__get_dynamic_array(oem_buf), __entry->oem_len)) +); + +/* + * Non-Standard Section Report + * + * This event is generated when hardware detected a hardware + * error event, which may be of non-standard section as defined + * in UEFI spec appendix "Common Platform Error Record", or may + * be of sections for which TRACE_EVENT is not defined. + * + */ +TRACE_EVENT(non_standard_event, + + TP_PROTO(const guid_t *sec_type, + const guid_t *fru_id, + const char *fru_text, + const u8 sev, + const u8 *err, + const u32 len), + + TP_ARGS(sec_type, fru_id, fru_text, sev, err, len), + + TP_STRUCT__entry( + __array(char, sec_type, UUID_SIZE) + __array(char, fru_id, UUID_SIZE) + __string(fru_text, fru_text) + __field(u8, sev) + __field(u32, len) + __dynamic_array(u8, buf, len) + ), + + TP_fast_assign( + memcpy(__entry->sec_type, sec_type, UUID_SIZE); + memcpy(__entry->fru_id, fru_id, UUID_SIZE); + __assign_str(fru_text); + __entry->sev = sev; + __entry->len = len; + memcpy(__get_dynamic_array(buf), err, len); + ), + + TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s", + __entry->sev, __entry->sec_type, + __entry->fru_id, __get_str(fru_text), + __entry->len, + __print_hex(__get_dynamic_array(buf), __entry->len)) +); + +#ifdef CONFIG_PCIEAER +/* + * PCIe AER Trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event on a PCIe device. The event report has + * the following structure: + * + * char * dev_name - The name of the slot where the device resides + * ([domain:]bus:device.function). + * u32 status - Either the correctable or uncorrectable register + * indicating what error or errors have been seen + * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED + */ + +#define aer_correctable_errors \ + {PCI_ERR_COR_RCVR, "Receiver Error"}, \ + {PCI_ERR_COR_BAD_TLP, "Bad TLP"}, \ + {PCI_ERR_COR_BAD_DLLP, "Bad DLLP"}, \ + {PCI_ERR_COR_REP_ROLL, "RELAY_NUM Rollover"}, \ + {PCI_ERR_COR_REP_TIMER, "Replay Timer Timeout"}, \ + {PCI_ERR_COR_ADV_NFAT, "Advisory Non-Fatal Error"}, \ + {PCI_ERR_COR_INTERNAL, "Corrected Internal Error"}, \ + {PCI_ERR_COR_LOG_OVER, "Header Log Overflow"} + +#define aer_uncorrectable_errors \ + {PCI_ERR_UNC_UND, "Undefined"}, \ + {PCI_ERR_UNC_DLP, "Data Link Protocol Error"}, \ + {PCI_ERR_UNC_SURPDN, "Surprise Down Error"}, \ + {PCI_ERR_UNC_POISON_TLP,"Poisoned TLP"}, \ + {PCI_ERR_UNC_FCP, "Flow Control Protocol Error"}, \ + {PCI_ERR_UNC_COMP_TIME, "Completion Timeout"}, \ + {PCI_ERR_UNC_COMP_ABORT,"Completer Abort"}, \ + {PCI_ERR_UNC_UNX_COMP, "Unexpected Completion"}, \ + {PCI_ERR_UNC_RX_OVER, "Receiver Overflow"}, \ + {PCI_ERR_UNC_MALF_TLP, "Malformed TLP"}, \ + {PCI_ERR_UNC_ECRC, "ECRC Error"}, \ + {PCI_ERR_UNC_UNSUP, "Unsupported Request Error"}, \ + {PCI_ERR_UNC_ACSV, "ACS Violation"}, \ + {PCI_ERR_UNC_INTN, "Uncorrectable Internal Error"},\ + {PCI_ERR_UNC_MCBTLP, "MC Blocked TLP"}, \ + {PCI_ERR_UNC_ATOMEG, "AtomicOp Egress Blocked"}, \ + {PCI_ERR_UNC_TLPPRE, "TLP Prefix Blocked Error"} + +TRACE_EVENT(aer_event, + TP_PROTO(const char *dev_name, + const u32 status, + const u8 severity, + const u8 tlp_header_valid, + struct pcie_tlp_log *tlp), + + TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), + + TP_STRUCT__entry( + __string( dev_name, dev_name ) + __field( u32, status ) + __field( u8, severity ) + __field( u8, tlp_header_valid) + __array( u32, tlp_header, PCIE_STD_MAX_TLP_HEADERLOG) + ), + + TP_fast_assign( + __assign_str(dev_name); + __entry->status = status; + __entry->severity = severity; + __entry->tlp_header_valid = tlp_header_valid; + if (tlp_header_valid) { + int i; + + for (i = 0; i < PCIE_STD_MAX_TLP_HEADERLOG; i++) + __entry->tlp_header[i] = tlp->dw[i]; + } + ), + + TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n", + __get_str(dev_name), + __entry->severity == AER_CORRECTABLE ? "Corrected" : + __entry->severity == AER_FATAL ? + "Fatal" : "Uncorrected, non-fatal", + __entry->severity == AER_CORRECTABLE ? + __print_flags(__entry->status, "|", aer_correctable_errors) : + __print_flags(__entry->status, "|", aer_uncorrectable_errors), + __entry->tlp_header_valid ? + __print_array(__entry->tlp_header, PCIE_STD_MAX_TLP_HEADERLOG, 4) : + "Not available") +); +#endif /* CONFIG_PCIEAER */ #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ |
