diff options
Diffstat (limited to 'drivers/acpi/apei/ghes.c')
-rw-r--r-- | drivers/acpi/apei/ghes.c | 225 |
1 files changed, 164 insertions, 61 deletions
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 7b7c605166e0..f0584ccad451 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -26,7 +26,7 @@ #include <linux/interrupt.h> #include <linux/timer.h> #include <linux/cper.h> -#include <linux/cxl-event.h> +#include <linux/cleanup.h> #include <linux/platform_device.h> #include <linux/mutex.h> #include <linux/ratelimit.h> @@ -34,6 +34,7 @@ #include <linux/irq_work.h> #include <linux/llist.h> #include <linux/genalloc.h> +#include <linux/kfifo.h> #include <linux/pci.h> #include <linux/pfn.h> #include <linux/aer.h> @@ -48,6 +49,7 @@ #include <acpi/apei.h> #include <asm/fixmap.h> #include <asm/tlbflush.h> +#include <cxl/event.h> #include <ras/ras_event.h> #include "apei-internal.h" @@ -171,8 +173,6 @@ static struct gen_pool *ghes_estatus_pool; static struct ghes_estatus_cache __rcu *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; static atomic_t ghes_estatus_cache_alloced; -static int ghes_panic_timeout __read_mostly = 30; - static void __iomem *ghes_map(u64 pfn, enum fixed_addresses fixmap_idx) { phys_addr_t paddr; @@ -674,41 +674,118 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } -/* - * Only a single callback can be registered for CXL CPER events. - */ -static DECLARE_RWSEM(cxl_cper_rw_sem); -static cxl_cper_callback cper_callback; +/* Room for 8 entries */ +#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8 +static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data, + CXL_CPER_PROT_ERR_FIFO_DEPTH); -/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ +/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */ +static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock); +struct work_struct *cxl_cper_prot_err_work; -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -#define CPER_SEC_CXL_GEN_MEDIA_GUID \ - GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ - 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) +static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, + int severity) +{ +#ifdef CONFIG_ACPI_APEI_PCIEAER + struct cxl_cper_prot_err_work_data wd; + u8 *dvsec_start, *cap_start; -/* - * DRAM Event Record - * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 - */ -#define CPER_SEC_CXL_DRAM_GUID \ - GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ - 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) + if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) { + pr_err_ratelimited("CXL CPER invalid agent type\n"); + return; + } -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -#define CPER_SEC_CXL_MEM_MODULE_GUID \ - GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ - 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) { + pr_err_ratelimited("CXL CPER invalid protocol error log\n"); + return; + } + + if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) { + pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n", + prot_err->err_len); + return; + } + + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) + pr_warn(FW_WARN "CXL CPER no device serial number\n"); + + guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock); + + if (!cxl_cper_prot_err_work) + return; + + switch (prot_err->agent_type) { + case RCD: + case DEVICE: + case LD: + case FMLD: + case RP: + case DSP: + case USP: + memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err)); + + dvsec_start = (u8 *)(prot_err + 1); + cap_start = dvsec_start + prot_err->dvsec_len; + + memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap)); + wd.severity = cper_severity_to_aer(severity); + break; + default: + pr_err_ratelimited("CXL CPER invalid agent type: %d\n", + prot_err->agent_type); + return; + } + + if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) { + pr_err_ratelimited("CXL CPER kfifo overflow\n"); + return; + } + + schedule_work(cxl_cper_prot_err_work); +#endif +} + +int cxl_cper_register_prot_err_work(struct work_struct *work) +{ + if (cxl_cper_prot_err_work) + return -EINVAL; + + guard(spinlock)(&cxl_cper_prot_err_work_lock); + cxl_cper_prot_err_work = work; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL"); + +int cxl_cper_unregister_prot_err_work(struct work_struct *work) +{ + if (cxl_cper_prot_err_work != work) + return -EINVAL; + + guard(spinlock)(&cxl_cper_prot_err_work_lock); + cxl_cper_prot_err_work = NULL; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL"); + +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) +{ + return kfifo_get(&cxl_cper_prot_err_fifo, wd); +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL"); + +/* Room for 8 entries for each of the 4 event log queues */ +#define CXL_CPER_FIFO_DEPTH 32 +DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH); + +/* Synchronize schedule_work() with cxl_cper_work changes */ +static DEFINE_SPINLOCK(cxl_cper_work_lock); +struct work_struct *cxl_cper_work; static void cxl_cper_post_event(enum cxl_event_type event_type, struct cxl_cper_event_rec *rec) { + struct cxl_cper_work_data wd; + if (rec->hdr.length <= sizeof(rec->hdr) || rec->hdr.length > sizeof(*rec)) { pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n", @@ -721,30 +798,49 @@ static void cxl_cper_post_event(enum cxl_event_type event_type, return; } - guard(rwsem_read)(&cxl_cper_rw_sem); - if (cper_callback) - cper_callback(event_type, rec); + guard(spinlock_irqsave)(&cxl_cper_work_lock); + + if (!cxl_cper_work) + return; + + wd.event_type = event_type; + memcpy(&wd.rec, rec, sizeof(wd.rec)); + + if (!kfifo_put(&cxl_cper_fifo, wd)) { + pr_err_ratelimited("CXL CPER kfifo overflow\n"); + return; + } + + schedule_work(cxl_cper_work); } -int cxl_cper_register_callback(cxl_cper_callback callback) +int cxl_cper_register_work(struct work_struct *work) { - guard(rwsem_write)(&cxl_cper_rw_sem); - if (cper_callback) + if (cxl_cper_work) return -EINVAL; - cper_callback = callback; + + guard(spinlock)(&cxl_cper_work_lock); + cxl_cper_work = work; return 0; } -EXPORT_SYMBOL_NS_GPL(cxl_cper_register_callback, CXL); +EXPORT_SYMBOL_NS_GPL(cxl_cper_register_work, "CXL"); -int cxl_cper_unregister_callback(cxl_cper_callback callback) +int cxl_cper_unregister_work(struct work_struct *work) { - guard(rwsem_write)(&cxl_cper_rw_sem); - if (callback != cper_callback) + if (cxl_cper_work != work) return -EINVAL; - cper_callback = NULL; + + guard(spinlock)(&cxl_cper_work_lock); + cxl_cper_work = NULL; return 0; } -EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_callback, CXL); +EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_work, "CXL"); + +int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) +{ + return kfifo_get(&cxl_cper_fifo, wd); +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL"); static bool ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) @@ -780,20 +876,20 @@ static bool ghes_do_proc(struct ghes *ghes, } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { queued = ghes_handle_arm_hw_error(gdata, sev, sync); + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) { + struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); + + cxl_cper_post_prot_err(prot_err, gdata->error_severity); } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); + struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata); cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec); } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); + struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata); cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec); - } else if (guid_equal(sec_type, - &CPER_SEC_CXL_MEM_MODULE_GUID)) { - struct cxl_cper_event_rec *rec = - acpi_hest_get_payload(gdata); + } else if (guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) { + struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata); cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec); } else { @@ -988,14 +1084,16 @@ static void __ghes_panic(struct ghes *ghes, struct acpi_hest_generic_status *estatus, u64 buf_paddr, enum fixed_addresses fixmap_idx) { + const char *msg = GHES_PFX "Fatal hardware error"; + __ghes_print_estatus(KERN_EMERG, ghes->generic, estatus); ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx); - /* reboot to log the error! */ if (!panic_timeout) - panic_timeout = ghes_panic_timeout; - panic("Fatal hardware error!"); + pr_emerg("%s but panic disabled\n", msg); + + panic(msg); } static int ghes_proc(struct ghes *ghes) @@ -1040,7 +1138,7 @@ static void ghes_add_timer(struct ghes *ghes) static void ghes_poll_func(struct timer_list *t) { - struct ghes *ghes = from_timer(ghes, t, timer); + struct ghes *ghes = timer_container_of(ghes, t, timer); unsigned long flags; spin_lock_irqsave(&ghes_notify_lock_irq, flags); @@ -1544,7 +1642,7 @@ err: return rc; } -static int ghes_remove(struct platform_device *ghes_dev) +static void ghes_remove(struct platform_device *ghes_dev) { int rc; struct ghes *ghes; @@ -1581,8 +1679,15 @@ static int ghes_remove(struct platform_device *ghes_dev) break; case ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED: rc = apei_sdei_unregister_ghes(ghes); - if (rc) - return rc; + if (rc) { + /* + * Returning early results in a resource leak, but we're + * only here if stopping the hardware failed. + */ + dev_err(&ghes_dev->dev, "Failed to unregister ghes (%pe)\n", + ERR_PTR(rc)); + return; + } break; default: BUG(); @@ -1596,8 +1701,6 @@ static int ghes_remove(struct platform_device *ghes_dev) mutex_unlock(&ghes_devs_mutex); kfree(ghes); - - return 0; } static struct platform_driver ghes_platform_driver = { @@ -1612,7 +1715,7 @@ void __init acpi_ghes_init(void) { int rc; - sdei_init(); + acpi_sdei_init(); if (acpi_disabled) return; |